How to Perform OCR Operations on PDF Documents inside .NET Applications
This technical tip shows how .NET developers can perform OCR operations on PDF documents inside .NET applications. Aspose.OCR APIs accept only images as input for OCR. To perform OCR on a PDF document, two Aspose APIs are therefore used together: Aspose.Pdf converts the PDF pages to images, and Aspose.OCR performs the OCR operation on the converted images. This article demonstrates the use of Aspose.Pdf for .NET and Aspose.OCR for .NET to perform OCR on PDF documents.
//The sample code below shows how to perform OCR operations on PDF documents
//[C# Code Sample]
//Converts every page of a PDF to an in-memory JPEG and prints the text that
//Aspose.OCR recognizes on each page to the console.

//Rendering settings applied to every page image handed to the OCR engine
const int renderDpi = 300;    //DPI used when rasterizing a PDF page
const int jpegQuality = 100;  //JPEG quality [0-100], 100 is maximum

//Load the PDF. Document is IDisposable, so the using block releases it
//once all pages have been processed (also when an exception is thrown).
using (var pdfDocument = new Aspose.Pdf.Document("D:/sample.pdf"))
{
    //Create an instance of OcrEngine for recognition
    var ocrEngine = new Aspose.OCR.OcrEngine();

    //Iterate over the pages of the PDF; the loop indexes Pages from 1 to Count
    for (int pageCount = 1; pageCount <= pdfDocument.Pages.Count; pageCount++)
    {
        //MemoryStream holds the rendered page image temporarily
        using (var imageStream = new System.IO.MemoryStream())
        {
            //Create Resolution object with the DPI value
            var resolution = new Aspose.Pdf.Devices.Resolution(renderDpi);
            //Create JPEG device with the specified resolution and quality
            var jpegDevice = new Aspose.Pdf.Devices.JpegDevice(resolution, jpegQuality);

            //Convert the current page and save the image to the stream
            jpegDevice.Process(pdfDocument.Pages[pageCount], imageStream);
            //Rewind so the OCR engine reads the image from its first byte
            imageStream.Position = 0;

            //Set Image property of OcrEngine to the stream obtained from the previous step
            ocrEngine.Image = Aspose.OCR.ImageStream.FromStream(imageStream, Aspose.OCR.ImageStreamFormat.Jpg);

            //Perform OCR on one page at a time; text is printed only when Process() reports success
            if (ocrEngine.Process())
            {
                System.Console.WriteLine(ocrEngine.Text);
            }
        }
    }
}
//[VB.NET Code Sample]
'Converts every page of a PDF to an in-memory JPEG and prints the text that
'Aspose.OCR recognizes on each page to the console.

'Rendering settings applied to every page image handed to the OCR engine
Const renderDpi As Integer = 300    'DPI used when rasterizing a PDF page
Const jpegQuality As Integer = 100  'JPEG quality [0-100], 100 is maximum

'Load the PDF. Document is IDisposable, so the Using block releases it
'once all pages have been processed (also when an exception is thrown).
Using pdfDocument = New Aspose.Pdf.Document("D:/Disclosure(SDK).pdf")
    'Create an instance of OcrEngine for recognition
    Dim ocrEngine = New Aspose.OCR.OcrEngine()

    'Iterate over the pages of the PDF; the loop indexes Pages from 1 to Count
    For pageCount As Integer = 1 To pdfDocument.Pages.Count
        'MemoryStream holds the rendered page image temporarily
        Using imageStream = New System.IO.MemoryStream()
            'Create Resolution object with the DPI value
            Dim resolution = New Aspose.Pdf.Devices.Resolution(renderDpi)
            'Create JPEG device with the specified resolution and quality
            Dim jpegDevice = New Aspose.Pdf.Devices.JpegDevice(resolution, jpegQuality)

            'Convert the current page and save the image to the stream
            jpegDevice.Process(pdfDocument.Pages(pageCount), imageStream)
            'Rewind so the OCR engine reads the image from its first byte
            imageStream.Position = 0

            'Set Image property of OcrEngine to the stream obtained from the previous step
            ocrEngine.Image = Aspose.OCR.ImageStream.FromStream(imageStream, Aspose.OCR.ImageStreamFormat.Jpg)

            'Perform OCR on one page at a time; text is printed only when Process() reports success
            If ocrEngine.Process() Then
                System.Console.WriteLine(ocrEngine.Text)
            End If
        End Using
    Next pageCount
End Using
Url: http://www.aspose.com/.net/ocr-component.aspx
Language: C# | User: Sheraz Khan | Created: Nov 25, 2015 | Tags: Perform OCR Operations on PDF perform OCR operation on images convert PDF pages to images .NET PDF Component OCR operation on extracted images .NET OCR Component