PDF Focus .Net - Convert PDF to All with OCR engine in C#

Convert PDF to All with OCR engine in C#.

Since version 7.0, PDF Focus .Net can work with OCR. To perform OCR, we'll use the free OCR library by Nicomsoft (https://www.nicomsoft.com).
The library is freeware and can be used in commercial applications.

First, download the Nicomsoft OCR SDK: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe



Then install it on your PC or server.



How to use SautinSoft.PdfFocus with Optical Character Recognition (OCR).

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using SautinSoft;
using NSOCR_NameSpace;
using System.Drawing.Imaging;


namespace Sample
{
    /// <summary>
    /// Converts a PDF to DOCX and HTML with SautinSoft PDF Focus .Net, passing
    /// the images found in the PDF through the Nicomsoft OCR (NSOCR) engine.
    /// </summary>
    public class PdfConverter
    {
        // NSOCR COM wrapper; (re)created on every ConvertPdfToAllWithOCR call.
        internal NSOCRLib.NSOCRClass NsOCR;
        // Handles filled in by Engine_InitializeAdvanced.
        internal int CfgObj = 0;
        internal int OcrObj = 0;
        internal int ImgObj = 0;
        // Not used by the code in this class; kept for compatibility.
        internal int ScanObj = 0;
        // Handle of the saver object; only valid while PerformOCR is running.
        internal int SvrObj = 0;
        // Not used by the code in this class; kept for compatibility.
        internal bool OCRCreated = false;

        /// <summary>
        /// Converts the given PDF to "Result.docx" and "Result.html" (written to the
        /// current directory) using the OCR engine, and opens each file that was
        /// produced successfully with its associated application.
        /// </summary>
        /// <param name="pdfPath">Path to the source PDF file.</param>
        public void ConvertPdfToAllWithOCR(string pdfPath)
        {
            // To perform OCR we'll use free OCR library by Nicomsoft.
            // https://www.nicomsoft.com/products/ocr/download/
            // The library is freeware and can be used in commercial application.
            // Also you have to insert this key:  AB2A4DD5FF2A.
            NsOCR = new NSOCRLib.NSOCRClass();
            NsOCR.Engine_SetLicenseKey("AB2A4DD5FF2A"); //required for licensed version only
            NsOCR.Engine_InitializeAdvanced(out CfgObj, out OcrObj, out ImgObj);

            try
            {
                SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();
                f.OCROptions.Method = PerformOCR;
                f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
                f.WordOptions.KeepCharScaleAndSpacing = false;

                try
                {
                    f.OpenPdf(pdfPath);
                    if (f.PageCount > 0)
                    {
                        string outFile;

                        // To Docx.
                        outFile = "Result.docx";
                        f.WordOptions.Format = PdfFocus.CWordOptions.eWordDocument.Docx;
                        if (f.ToWord(outFile) == 0)
                        {
                            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
                        }

                        // To HTML.
                        outFile = "Result.html";
                        f.HtmlOptions.KeepCharScaleAndSpacing = false;
                        if (f.ToHtml(outFile) == 0)
                        {
                            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
                        }
                    }
                    else
                    {
                        // Exception may be unset when the document simply has no pages,
                        // so do not dereference it blindly.
                        string reason = f.Exception != null
                            ? f.Exception.Message
                            : "the PDF could not be opened or contains no pages";
                        Console.WriteLine("Error: {0}!", reason);
                        Console.ReadLine();
                    }
                }
                finally
                {
                    // Release the loaded document whether or not the conversion succeeded.
                    f.ClosePdf();
                }
            }
            finally
            {
                // Shut the OCR engine down so repeated calls do not accumulate engine instances.
                NsOCR.Engine_Uninitialize();
            }
        }

        /// <summary>
        /// OCR callback assigned to <c>PdfFocus.OCROptions.Method</c>: recognizes the
        /// text on one image and returns the result as an in-memory PDF.
        /// </summary>
        /// <param name="scanned">Image to recognize.</param>
        /// <returns>
        /// The bytes of the PDF produced by NSOCR, or <c>null</c> when the image is
        /// missing or any OCR step fails.
        /// </returns>
        private byte[] PerformOCR(System.Drawing.Image scanned)
        {
            if (scanned == null)
            {
                return null;
            }

            try
            {
                NsOCR.Cfg_SetOption(CfgObj, TNSOCR.BT_DEFAULT, "Languages/English", "1");

                // NSOCR loads images from a byte buffer, so serialize the bitmap as PNG first.
                Array imgArray = null;
                using (MemoryStream ms = new MemoryStream())
                {
                    scanned.Save(ms, ImageFormat.Png);
                    ms.Flush();
                    imgArray = ms.ToArray();
                }

                int res = NsOCR.Img_LoadFromMemory(ImgObj, ref imgArray, imgArray.Length);
                if (res > TNSOCR.ERROR_FIRST)
                {
                    return null;
                }

                res = NsOCR.Svr_Create(CfgObj, TNSOCR.SVR_FORMAT_PDF, out SvrObj);
                if (res > TNSOCR.ERROR_FIRST)
                {
                    return null;
                }

                try
                {
                    NsOCR.Svr_NewDocument(SvrObj);

                    res = NsOCR.Img_OCR(ImgObj, TNSOCR.OCRSTEP_FIRST, TNSOCR.OCRSTEP_LAST, TNSOCR.OCRFLAG_NONE);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }

                    res = NsOCR.Svr_AddPage(SvrObj, ImgObj, TNSOCR.FMT_EXACTCOPY);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }

                    Array outPdf = null;
                    NsOCR.Svr_SaveToMemory(SvrObj, out outPdf);

                    return (byte[])outPdf;
                }
                finally
                {
                    // A saver is created for every image; destroy it here so the
                    // handles do not pile up over a multi-image document.
                    NsOCR.Svr_Destroy(SvrObj);
                    SvrObj = 0;
                }
            }
            catch
            {
                // Deliberately best-effort: an OCR failure on one image is reported
                // as "no result" instead of aborting the whole conversion.
                return null;
            }
        }
    }
    /// <summary>
    /// Console entry point of the sample: runs the OCR-enabled conversion on
    /// "scan.pdf", which is looked up two directories above the working directory.
    /// </summary>
    class Sample
    {
        static void Main(string[] args)
        {
            // OCR is done by the free Nicomsoft OCR library
            // (https://www.nicomsoft.com/products/ocr/download/), which is freeware
            // and may be used in commercial applications.
            //
            // Troubleshooting - if this sample does not compile and the errors mention
            //   NSOCRClass: Engine_SetLicenseKey
            //   PdfFocus: OCROptions
            // then:
            //   1. Download Nicomsoft OCR SDK from: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe
            //   2. Install it on your PC or server.
            //   3. Build and run the sample again.
            //
            // The full manual is in Readme.html ("How to use PDF Focus .Net with OCR").
            // IMPORTANT: PDF Focus .Net supports OCR since version 7.0.
            new PdfConverter().ConvertPdfToAllWithOCR(Path.GetFullPath(@"..\..\scan.pdf"));
        }
    }
}

Download.

        
            Imports System
Imports System.Collections.Generic
Imports System.Linq
Imports System.Text
Imports System.Threading.Tasks
Imports System.IO
Imports SautinSoft
Imports NSOCR_NameSpace
Imports System.Drawing.Imaging


Namespace Sample
    ''' <summary>
    ''' Converts a PDF to DOCX and HTML with SautinSoft PDF Focus .Net, passing
    ''' the images found in the PDF through the Nicomsoft OCR (NSOCR) engine.
    ''' </summary>
    Public Class PdfConverter
        ' NSOCR COM wrapper; (re)created on every ConvertPdfToAllWithOCR call.
        Friend NsOCR As NSOCRLib.NSOCRClass
        ' Handles filled in by Engine_InitializeAdvanced.
        Friend CfgObj As Integer = 0
        Friend OcrObj As Integer = 0
        Friend ImgObj As Integer = 0
        ' Not used by the code in this class; kept for compatibility.
        Friend ScanObj As Integer = 0
        ' Handle of the saver object; only valid while PerformOCR is running.
        Friend SvrObj As Integer = 0
        ' Not used by the code in this class; kept for compatibility.
        Friend OCRCreated As Boolean = False

        ''' <summary>
        ''' Converts the given PDF to "Result.docx" and "Result.html" (written to the
        ''' current directory) using the OCR engine, and opens each file that was
        ''' produced successfully with its associated application.
        ''' </summary>
        ''' <param name="pdfPath">Path to the source PDF file.</param>
        Public Sub ConvertPdfToAllWithOCR(ByVal pdfPath As String)
            ' To perform OCR we'll use free OCR library by Nicomsoft.
            ' https://www.nicomsoft.com/products/ocr/download/
            ' The library is freeware and can be used in commercial application.
            ' Also you have to insert this key:  AB2A4DD5FF2A.
            NsOCR = New NSOCRLib.NSOCRClass()
            NsOCR.Engine_SetLicenseKey("AB2A4DD5FF2A") 'required for licensed version only
            NsOCR.Engine_InitializeAdvanced(CfgObj, OcrObj, ImgObj)

            Try
                Dim f As New SautinSoft.PdfFocus()
                f.OCROptions.Method = AddressOf PerformOCR
                f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages
                f.WordOptions.KeepCharScaleAndSpacing = False

                Try
                    f.OpenPdf(pdfPath)
                    If f.PageCount > 0 Then
                        Dim outFile As String

                        ' To Docx.
                        outFile = "Result.docx"
                        f.WordOptions.Format = PdfFocus.CWordOptions.eWordDocument.Docx
                        If f.ToWord(outFile) = 0 Then
                            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
                        End If

                        ' To HTML.
                        outFile = "Result.html"
                        f.HtmlOptions.KeepCharScaleAndSpacing = False
                        If f.ToHtml(outFile) = 0 Then
                            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
                        End If
                    Else
                        ' Exception may be unset when the document simply has no pages,
                        ' so do not dereference it blindly.
                        Dim reason As String = If(f.Exception IsNot Nothing, f.Exception.Message, "the PDF could not be opened or contains no pages")
                        Console.WriteLine("Error: {0}!", reason)
                        Console.ReadLine()
                    End If
                Finally
                    ' Release the loaded document whether or not the conversion succeeded.
                    f.ClosePdf()
                End Try
            Finally
                ' Shut the OCR engine down so repeated calls do not accumulate engine instances.
                NsOCR.Engine_Uninitialize()
            End Try
        End Sub

        ''' <summary>
        ''' OCR callback assigned to PdfFocus.OCROptions.Method: recognizes the
        ''' text on one image and returns the result as an in-memory PDF.
        ''' </summary>
        ''' <param name="scanned">Image to recognize.</param>
        ''' <returns>
        ''' The bytes of the PDF produced by NSOCR, or Nothing when the image is
        ''' missing or any OCR step fails.
        ''' </returns>
        Private Function PerformOCR(ByVal scanned As System.Drawing.Image) As Byte()
            If scanned Is Nothing Then
                Return Nothing
            End If

            Try
                NsOCR.Cfg_SetOption(CfgObj, TNSOCR.BT_DEFAULT, "Languages/English", "1")

                ' NSOCR loads images from a byte buffer, so serialize the bitmap as PNG first.
                Dim imgArray As Array = Nothing
                Using ms As New MemoryStream()
                    scanned.Save(ms, ImageFormat.Png)
                    ms.Flush()
                    imgArray = ms.ToArray()
                End Using

                Dim res As Integer = NsOCR.Img_LoadFromMemory(ImgObj, imgArray, imgArray.Length)
                If res > TNSOCR.ERROR_FIRST Then
                    Return Nothing
                End If

                res = NsOCR.Svr_Create(CfgObj, TNSOCR.SVR_FORMAT_PDF, SvrObj)
                If res > TNSOCR.ERROR_FIRST Then
                    Return Nothing
                End If

                Try
                    NsOCR.Svr_NewDocument(SvrObj)

                    res = NsOCR.Img_OCR(ImgObj, TNSOCR.OCRSTEP_FIRST, TNSOCR.OCRSTEP_LAST, TNSOCR.OCRFLAG_NONE)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If

                    res = NsOCR.Svr_AddPage(SvrObj, ImgObj, TNSOCR.FMT_EXACTCOPY)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If

                    Dim outPdf As Array = Nothing
                    NsOCR.Svr_SaveToMemory(SvrObj, outPdf)

                    Return CType(outPdf, Byte())
                Finally
                    ' A saver is created for every image; destroy it here so the
                    ' handles do not pile up over a multi-image document.
                    NsOCR.Svr_Destroy(SvrObj)
                    SvrObj = 0
                End Try
            Catch
                ' Deliberately best-effort: an OCR failure on one image is reported
                ' as "no result" instead of aborting the whole conversion.
                Return Nothing
            End Try
        End Function
    End Class
    ''' <summary>
    ''' Console entry point of the sample: runs the OCR-enabled conversion on
    ''' "scan.pdf", which is looked up one directory above the working directory.
    ''' </summary>
    Friend Class Sample
        Shared Sub Main(ByVal args() As String)
            ' OCR is done by the free Nicomsoft OCR library
            ' (https://www.nicomsoft.com/products/ocr/download/), which is freeware
            ' and may be used in commercial applications.
            '
            ' Troubleshooting - if this sample does not compile and the errors mention
            '   NSOCRClass: Engine_SetLicenseKey
            '   PdfFocus: OCROptions
            ' then:
            '   1. Download Nicomsoft OCR SDK from: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe
            '   2. Install it on your PC or server.
            '   3. Build and run the sample again.
            '
            ' The full manual is in Readme.html ("How to use PDF Focus .Net with OCR").
            ' IMPORTANT: PDF Focus .Net supports OCR since version 7.0.
            Dim ocrConverter As PdfConverter = New PdfConverter()
            Dim scanPath As String = Path.GetFullPath("..\scan.pdf")

            ocrConverter.ConvertPdfToAllWithOCR(scanPath)
        End Sub
    End Class
End Namespace

Download.


If you need a code sample in C#, VB.Net, ASP.Net, etc. showing how to convert PDF to Docx, Rtf, HTML, Excel, or other formats, email us at support@sautinsoft.com or ask in the Online Chat (bottom-right corner of this page). We'll certainly help you!