PdfFocus.COCROptions Class
A class that allows you to attach any OCR library to PDF Focus .Net.
Inheritance Hierarchy Namespace: SautinSoft Assembly: SautinSoft.PdfFocus (in SautinSoft.PdfFocus.dll) Version: 2024.3.28
Syntax The PdfFocus.COCROptions type exposes the following members.
Constructors Properties | Name | Description |
---|
| Method |
The method that performs OCR (any 3rd-party library can be used). We suggest the free library from Nicomsoft: https://www.nicomsoft.com/nicomsoft-ocr-sdk-is-freeware-now.
|
| Mode |
Set the OCR mode: Disable (default), All images, Automatic.
|
Top
Example: Perform OCR using the free Tesseract SDK in C#
using System.IO;
using SautinSoft;
using System;
namespace Example
{
    /// <summary>
    /// Sample console program: converts a scanned PDF to a Word document with
    /// PDF Focus .Net, delegating text recognition to the Tesseract SDK through
    /// the <c>OCROptions.Method</c> callback.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; runs the scanned-PDF conversion sample.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            LoadScannedPdf();
        }

        /// <summary>
        /// Opens <c>scan.pdf</c>, converts it to <c>Result.docx</c> with OCR enabled
        /// for all images, and opens the result with the shell on success.
        /// </summary>
        static void LoadScannedPdf()
        {
            string inpFile = Path.GetFullPath(@"..\..\..\scan.pdf");
            string outFile = "Result.docx";

            PdfFocus f = new PdfFocus();

            // OCR is disabled by default; request it for every image and plug in
            // the Tesseract-based callback defined below.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
            f.OCROptions.Method += PerformOCRTesseract;

            f.OpenPdf(inpFile);

            bool result = false;
            if (f.PageCount > 0)
            {
                // This sample treats a return value of 0 as a successful conversion.
                result = f.ToWord(outFile) == 0;
            }

            if (result)
            {
                // Open the produced document with its associated application.
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
            }
            else
            {
                Console.WriteLine("Conversion failed!");
            }
        }

        /// <summary>
        /// OCR callback: renders the supplied image (every frame of it, if it is
        /// multi-frame) into a PDF through Tesseract and returns that PDF's bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes readable by <see cref="System.Drawing.Image.FromStream(Stream)"/>.</param>
        /// <returns>The content of the PDF file produced by the Tesseract PDF renderer.</returns>
        /// <exception cref="Exception">
        /// Thrown when any step fails; the original error is available as
        /// <see cref="Exception.InnerException"/>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            string tesseractLanguages = "eng";
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");
            // The renderer is given an extension-less path; the file read back
            // and cleaned up below is that path with ".pdf" appended.
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
            bool skipImages = true;

            try
            {
                // The renderer is disposed before the file is read back below.
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages))
                using (renderer.BeginDocument("Serachablepdf"))
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // The Image is declared after the stream so that it is disposed
                    // first: the stream has to stay open while the Image is in use.
                    using (MemoryStream msImg = new MemoryStream(image))
                    using (System.Drawing.Image imgWithText = System.Drawing.Image.FromStream(msImg))
                    {
                        int frameCount = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page);
                        for (int i = 0; i < frameCount; i++)
                        {
                            imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i);

                            // Re-encode the active frame as PNG so it can be loaded as a Pix.
                            using (MemoryStream ms = new MemoryStream())
                            {
                                imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                                byte[] imgBytes = ms.ToArray();

                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                {
                                    renderer.AddPage(page);
                                }
                            }
                        }
                    }
                }

                return File.ReadAllBytes(tempFile + ".pdf");
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception attached so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Remove the temporary PDF whether or not the OCR succeeded.
                if (File.Exists(tempFile + ".pdf"))
                {
                    File.Delete(tempFile + ".pdf");
                }
            }
        }
    }
}
Perform OCR using the free Tesseract SDK in VB.Net
Option Infer On
Imports System.IO
Imports SautinSoft
Imports System
Namespace Example
    ''' <summary>
    ''' Sample console program: converts a scanned PDF to a Word document with
    ''' PDF Focus .Net, delegating text recognition to the Tesseract SDK through
    ''' the OCROptions.Method callback.
    ''' </summary>
    Friend Class Program
        ''' <summary>Entry point; runs the scanned-PDF conversion sample.</summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            LoadScannedPdf()
        End Sub

        ''' <summary>
        ''' Opens scan.pdf, converts it to Result.docx with OCR enabled for all
        ''' images, and opens the result with the shell on success.
        ''' </summary>
        Private Shared Sub LoadScannedPdf()
            ' Resolve to an absolute path, as the tessdata folder below is resolved.
            Dim inpFile As String = Path.GetFullPath("..\..\..\scan.pdf")
            Dim outFile As String = "Result.docx"

            Dim f As New PdfFocus()

            ' OCR is disabled by default; request it for every image and plug in
            ' the Tesseract-based callback defined below.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages
            f.OCROptions.Method = AddressOf PerformOCRTesseract

            f.OpenPdf(inpFile)

            Dim result As Boolean = False
            If f.PageCount > 0 Then
                ' This sample treats a return value of 0 as a successful conversion.
                result = f.ToWord(outFile) = 0
            End If

            If result Then
                ' Open the produced document with its associated application.
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
            Else
                Console.WriteLine("Conversion failed!")
            End If
        End Sub

        ''' <summary>
        ''' OCR callback: renders the supplied image (every frame of it, if it is
        ''' multi-frame) into a PDF through Tesseract and returns that PDF's bytes.
        ''' </summary>
        ''' <param name="image">Encoded image bytes readable by System.Drawing.Image.FromStream.</param>
        ''' <returns>The content of the PDF file produced by the Tesseract PDF renderer.</returns>
        ''' <exception cref="Exception">
        ''' Thrown when any step fails; the original error is available as InnerException.
        ''' </exception>
        Public Shared Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
            Dim tesseractLanguages As String = "eng"
            Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")
            ' The renderer is given an extension-less path; the file read back
            ' and cleaned up below is that path with ".pdf" appended.
            Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
            Dim skipImages As Boolean = True

            Try
                ' The renderer is disposed before the file is read back below.
                Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages)
                    Using renderer.BeginDocument("Serachablepdf")
                        Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                            Using msImg As New MemoryStream(image)
                                ' The Image is nested inside the stream's Using so that it is
                                ' disposed first: the stream has to stay open while it is in use.
                                Using imgWithText As System.Drawing.Image = System.Drawing.Image.FromStream(msImg)
                                    Dim frameCount As Integer = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page)
                                    For i As Integer = 0 To frameCount - 1
                                        imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i)

                                        ' Re-encode the active frame as PNG so it can be loaded as a Pix.
                                        Using ms As New MemoryStream()
                                            imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
                                            Dim imgBytes() As Byte = ms.ToArray()

                                            Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                                Using page = engine.Process(img, "Serachablepdf")
                                                    renderer.AddPage(page)
                                                End Using
                                            End Using
                                        End Using
                                    Next
                                End Using
                            End Using
                        End Using
                    End Using
                End Using

                Return File.ReadAllBytes(tempFile & ".pdf")
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Keep the original exception attached so its type and stack trace are not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            Finally
                ' Remove the temporary PDF whether or not the OCR succeeded.
                If File.Exists(tempFile & ".pdf") Then
                    File.Delete(tempFile & ".pdf")
                End If
            End Try
        End Function
    End Class
End Namespace
See Also