Click or drag to resize

OCROptions Class

Represents a class that stores options for Optical Character Recognition (OCR): whether OCR is enabled and which method performs it.
Inheritance Hierarchy
System.Object
  SautinSoft.Document.OCROptions

Namespace: SautinSoft.Document
Assembly: SautinSoft.Document (in SautinSoft.Document.dll) Version: 2024.1.24
Syntax
public sealed class OCROptions

The OCROptions type exposes the following members.

Constructors
 NameDescription
Public methodOCROptions A constructor for working with OCR.
Top
Properties
 NameDescription
Public propertyMethod Method to perform OCR (any 3rd party). We offer a free library from Nicomsoft: https://www.nicomsoft.com/nicomsoft-ocr-sdk-is-freeware-now.
Public propertyOCRMode Gets or sets OCR mode. Default value: Disabled.
Top
Example

See Developer Guide: Recognize an image using Tesseract (free OCR library)

Recognize an image using Tesseract (free OCR library) using C#
using System.IO;
using SautinSoft.Document;
using System;
using SkiaSharp;

namespace Example
{
    class Program
    {
        static void Main(string[] args)
        {
            // Get your free 30-day key here:
            // https://sautinsoft.com/start-for-free/

            RecognizeImage();
        }

        /// <summary>
        /// Recognize an image using Tesseract (free OCR library) and save the result as DOCX document.
        /// </summary>
        /// <remarks>
        /// Details: https://www.sautinsoft.com/products/document/help/net/developer-guide/ocr-image-using-tesseract-and-save-as-docx-net-csharp-vb.php
        /// </remarks>
        static void RecognizeImage()
        {
            // Here we'll recognize an image (perform OCR) containing text in English, Russian and Vietnamese.
            // Next, save the OCR result as a new DOCX document.

            // First steps:

            // 1. Download data files for English, Russian and Vietnamese languages.
            // Please download the files: eng.traineddata, rus.traineddata and vie.traineddata.
            // From here (good and fast): https://github.com/tesseract-ocr/tessdata_fast
            // or (best and slow): https://github.com/tesseract-ocr/tessdata_best

            // 2. Copy the files: eng.traineddata, rus.traineddata and vie.traineddata to
            // the folder "tessdata" in the Project root.

            // 3. Be sure that the folder "tessdata" also contains "pdf.ttf" file.

            // Let's start:
            string inpFile = @"..\..\..\image.png";
            string outFile = "Result1.docx";

            ImageLoadOptions lo = new ImageLoadOptions();
            lo.OCROptions.OCRMode = OCRMode.Enabled;

            // You can specify all Tesseract parameters inside the method PerformOCRTesseract.
            lo.OCROptions.Method = PerformOCRTesseract;
            DocumentCore dc = DocumentCore.Load(inpFile, lo);

            // Make all text visible after Tesseract OCR (change font color to Black).
            // The matter is that Tesseract returns OCR result PDF document with invisible text.
            // But with help of Document .Net, we can change the text color,
            // char scaling and spacing to desired.
            foreach (Run r in dc.GetChildElements(true, ElementType.Run))
            {
                r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black;
                r.CharacterFormat.Scaling = 100;
                r.CharacterFormat.Spacing = 0;
                r.CharacterFormat.Size = 12;
            }

            // Change the page size and add page margins.
            Section section = dc.Sections[0];
            section.PageSetup.PaperType = PaperType.Letter;
            section.PageSetup.Orientation = Orientation.Landscape;
            double m = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point);
            section.PageSetup.PageMargins = new PageMargins() { Top = m, Left = m, Right = m, Bottom = m };
            dc.Save(outFile);

            // Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// OCR callback: runs Tesseract over the supplied image and returns the recognized result as a PDF.
        /// </summary>
        /// <param name="image">Encoded image bytes to recognize.</param>
        /// <returns>Bytes of a PDF document obtained by loading the Tesseract output and re-saving it as PDF.</returns>
        /// <exception cref="Exception">
        /// Thrown when any step fails; the original failure is available through <see cref="Exception.InnerException"/>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            // Specify that Tesseract uses three languages: English, Russian and Vietnamese.
            string tesseractLanguages = "rus+eng+vie";

            // A path to a folder which contains languages data files and font file "pdf.ttf".
            // Language data files can be found here:
            // Good and fast: https://github.com/tesseract-ocr/tessdata_fast
            // or
            // Best and slow: https://github.com/tesseract-ocr/tessdata_best
            // Also this folder must have write permissions.
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");

            // A path for a temporary PDF file (because Tesseract returns OCR result as PDF document).
            // The renderer is given the name without extension; the result is read back from tempFile + ".pdf".
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());

            try
            {
                // The renderer and its document must be disposed before the PDF is read back below.
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, true))
                {
                    using (renderer.BeginDocument("Serachablepdf"))
                    {
                        using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                        {
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;
                            using (MemoryStream msImg = new MemoryStream(image))
                            // SKBitmap wraps native memory, so it is disposed deterministically.
                            using (SKBitmap imgWithText = SKBitmap.Decode(msImg))
                            {
                                // Decode yields null when the bytes are not a supported image;
                                // fail with a clear message instead of a NullReferenceException.
                                if (imgWithText == null)
                                {
                                    throw new InvalidDataException("The image to recognize could not be decoded.");
                                }

                                // Re-encode as PNG before handing the bytes to Tesseract.
                                using (MemoryStream ms = new MemoryStream())
                                {
                                    imgWithText.Encode(ms, SKEncodedImageFormat.Png, 100);
                                    byte[] imgBytes = ms.ToArray();
                                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                    {
                                        using (var page = engine.Process(img, "Serachablepdf"))
                                        {
                                            renderer.AddPage(page);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                PdfLoadOptions pl = new PdfLoadOptions();
                pl.ShowInvisibleText = true;
                // 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
                // 'Enabled' - Always load embedded fonts in PDF.
                // 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
                pl.PreserveEmbeddedFonts = PropertyState.Disabled;
                pl.ConversionMode = PdfConversionMode.Continuous;

                // Dispose the file stream before 'finally' runs: an open handle would leak
                // and can prevent the temporary file from being deleted.
                // The stream stays open until the document has been saved to memory.
                using (FileStream pdfStream = File.OpenRead(tempFile + ".pdf"))
                using (MemoryStream ms = new MemoryStream())
                {
                    DocumentCore dc = DocumentCore.Load(pdfStream, pl);
                    PdfSaveOptions ps = new PdfSaveOptions();
                    dc.Save(ms, ps);
                    return ms.ToArray();
                }
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");

                // ReadKey throws when input is redirected, which would mask the real error.
                if (!Console.IsInputRedirected)
                {
                    Console.ReadKey();
                }

                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Always remove the temporary PDF, whether OCR succeeded or failed.
                if (File.Exists(tempFile + ".pdf"))
                {
                    File.Delete(tempFile + ".pdf");
                }
            }
        }
    }
}
Recognize an image using Tesseract (free OCR library) using VB.Net
Imports System
Imports System.IO
Imports SautinSoft.Document
Imports SkiaSharp

Module Sample
    Sub Main()
        ' Get your free 30-day key here:
        ' https://sautinsoft.com/start-for-free/

        RecognizeImage()
    End Sub

    ''' <summary>
    ''' Recognize an image using Tesseract (free OCR library) and save the result as DOCX document.
    ''' </summary>
    ''' <remarks>
    ''' Details: https://www.sautinsoft.com/products/document/help/net/developer-guide/ocr-image-using-tesseract-and-save-as-docx-net-csharp-vb.php
    ''' </remarks>
    Sub RecognizeImage()
        ' Here we'll recognize an image (perform OCR) containing text in English, Russian and Vietnamese.
        ' Next, save the OCR result as a new DOCX document.

        ' First steps:

        ' 1. Download data files for English, Russian and Vietnamese languages.
        ' Please download the files: eng.traineddata, rus.traineddata and vie.traineddata.
        ' From here (good and fast): https://github.com/tesseract-ocr/tessdata_fast
        ' or (best and slow): https://github.com/tesseract-ocr/tessdata_best

        ' 2. Copy the files: eng.traineddata, rus.traineddata and vie.traineddata to
        ' the folder "tessdata" in the Project root.

        ' 3. Be sure that the folder "tessdata" also contains "pdf.ttf" file.

        ' Let's start:
        Dim inpFile As String = "..\..\..\image.png"
        Dim outFile As String = "Result.docx"

        Dim lo As New ImageLoadOptions()
        lo.OCROptions.OCRMode = OCRMode.Enabled

        ' You can specify all Tesseract parameters inside the method PerformOCRTesseract.
        lo.OCROptions.Method = AddressOf PerformOCRTesseract
        Dim dc As DocumentCore = DocumentCore.Load(inpFile, lo)

        ' Make all text visible after Tesseract OCR (change font color to Black).
        ' The matter is that Tesseract returns OCR result PDF document with invisible text.
        ' But with help of Document .Net, we can change the text color,
        ' char scaling and spacing to desired.
        For Each r As Run In dc.GetChildElements(True, ElementType.Run)
            r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black
            r.CharacterFormat.Scaling = 100
            r.CharacterFormat.Spacing = 0
            r.CharacterFormat.Size = 12
        Next r

        ' Change the page size and add page margins.
        Dim section As Section = dc.Sections(0)
        section.PageSetup.PaperType = PaperType.Letter
        section.PageSetup.Orientation = Orientation.Landscape
        Dim m As Double = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point)
        section.PageSetup.PageMargins = New PageMargins() With {
                .Top = m,
                .Left = m,
                .Right = m,
                .Bottom = m
            }

        dc.Save(outFile)

        ' Open the result for demonstration purposes.
        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
    End Sub

    ''' <summary>
    ''' OCR callback: runs Tesseract over the supplied image and returns the recognized result as a PDF.
    ''' </summary>
    ''' <param name="image">Encoded image bytes to recognize.</param>
    ''' <returns>Bytes of a PDF document obtained by loading the Tesseract output and re-saving it as PDF.</returns>
    ''' <exception cref="Exception">
    ''' Thrown when any step fails; the original failure is available through InnerException.
    ''' </exception>
    Public Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
        ' Specify that Tesseract uses three languages: English, Russian and Vietnamese.
        Dim tesseractLanguages As String = "rus+eng+vie"

        ' A path to a folder which contains languages data files and font file "pdf.ttf".
        ' Language data files can be found here:
        ' Good and fast: https://github.com/tesseract-ocr/tessdata_fast
        ' or
        ' Best and slow: https://github.com/tesseract-ocr/tessdata_best
        ' Also this folder must have write permissions.
        Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")

        ' A path for a temporary PDF file (because Tesseract returns OCR result as PDF document).
        ' The renderer is given the name without extension; the result is read back from tempFile & ".pdf".
        Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())

        Try
            ' The renderer and its document must be disposed before the PDF is read back below.
            Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, True)
                Using renderer.BeginDocument("Serachablepdf")
                    Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto
                        Using msImg As New MemoryStream(image)
                            ' SKBitmap wraps native memory, so it is disposed deterministically.
                            Using imgWithText As SKBitmap = SKBitmap.Decode(msImg)
                                ' Decode yields Nothing when the bytes are not a supported image;
                                ' fail with a clear message instead of a NullReferenceException.
                                If imgWithText Is Nothing Then
                                    Throw New InvalidDataException("The image to recognize could not be decoded.")
                                End If

                                ' Re-encode as PNG before handing the bytes to Tesseract.
                                Using ms As New MemoryStream()
                                    imgWithText.Encode(ms, SKEncodedImageFormat.Png, 100)
                                    Dim imgBytes() As Byte = ms.ToArray()
                                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                        Using page = engine.Process(img, "Serachablepdf")
                                            renderer.AddPage(page)
                                        End Using
                                    End Using
                                End Using
                            End Using
                        End Using
                    End Using
                End Using
            End Using

            Dim pl As New PdfLoadOptions()
            pl.ShowInvisibleText = True
            ' 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
            ' 'Enabled' - Always load embedded fonts in PDF.
            ' 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
            pl.PreserveEmbeddedFonts = PropertyState.Disabled
            pl.ConversionMode = PdfConversionMode.Continuous

            ' Dispose the file stream before Finally runs: an open handle would leak
            ' and can prevent the temporary file from being deleted.
            ' The stream stays open until the document has been saved to memory.
            Using pdfStream As FileStream = File.OpenRead(tempFile & ".pdf")
                Using ms As New MemoryStream()
                    Dim dc As DocumentCore = DocumentCore.Load(pdfStream, pl)
                    Dim ps As New PdfSaveOptions()
                    dc.Save(ms, ps)
                    Return ms.ToArray()
                End Using
            End Using
        Catch e As Exception
            Console.WriteLine()
            Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
            Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")

            ' ReadKey throws when input is redirected, which would mask the real error.
            If Not Console.IsInputRedirected Then
                Console.ReadKey()
            End If

            ' Keep the original exception as InnerException so its type and stack trace are not lost.
            Throw New Exception("Error Tesseract: " & e.Message, e)
        Finally
            ' Always remove the temporary PDF, whether OCR succeeded or failed.
            If File.Exists(tempFile & ".pdf") Then
                File.Delete(tempFile & ".pdf")
            End If
        End Try
    End Function
End Module
See Also