Document .Net - Full text search in PDF, DOCX, RTF and HTML files using C# and VB .Net

How to launch full text search in PDF, DOCX, RTF and HTML files
using C# and VB .Net


Here we'll show you how to run a full text search in a specific directory, including its subdirectories.
Using a regular expression, we'll find the word "video" in any letter case (video, VIDEO, ViDeO, etc.) in all supported files (DOCX, RTF, PDF and HTML) inside the specified directory and output the results to the Console.


Complete code

using System;
using System.IO;
using System.Collections.Generic;
using SautinSoft.Document;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Text.RegularExpressions;



namespace Sample
{
    class Sample
    {
        // Extensions of the files that are searched. The set compares ordinally and
        // case-insensitively, so ".PDF" and ".pdf" are treated alike in every culture.
        private static readonly HashSet<string> SupportedExtensions =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase) { ".docx", ".pdf", ".html", ".rtf" };

        static void Main(string[] args)
        {
            FullTextSearching(@"c:\Test\Full text searching", "video");
        }

        /// <summary>
        /// This sample shows how to launch full text search in the specific directory.
        /// Every *.docx, *.rtf, *.pdf and *.html file below <paramref name="searchPath"/>
        /// (subdirectories included) is loaded, the whole-word, case-insensitive occurrences
        /// of <paramref name="searchText"/> are counted, and one result line per file plus
        /// a final summary are written to the console. The method then waits for a key press.
        /// </summary>
        /// <param name="searchPath">Directory to search; may be absolute or relative.</param>
        /// <param name="searchText">
        /// Text to look for. It is inserted into the regular expression as-is, so regex
        /// metacharacters keep their special meaning; pass the value through
        /// <see cref="Regex.Escape(string)"/> first if it must be matched literally.
        /// </param>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/document/examples/full-text-searching-in-documents-net-csharp-vb.php
        /// </remarks>
        public static void FullTextSearching(string searchPath, string searchText)
        {
            DirectoryInfo searchDir = new DirectoryInfo(searchPath);

            // The file names returned below are built from searchDir.FullName, so display names
            // must be cut relative to that resolved path, not to the raw searchPath argument
            // (which may be relative or carry a trailing separator).
            char[] separators = { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };
            string rootPath = searchDir.FullName.TrimEnd(separators);

            // 1. Find the files to search in.
            // Only *.docx, *.rtf, *.pdf and *.html files are taken, including subdirectories.
            List<string> supportedFiles = Directory
                .GetFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories)
                .Where(candidate => SupportedExtensions.Contains(Path.GetExtension(candidate)))
                .ToList();

            // 2. Perform the text search in each file using a loop
            // and count how many times each file contains the search text.
            Console.WriteLine($"The results for \"{searchText}\":");

            // The pattern does not depend on the file, so it is built once for the whole run.
            Regex wordPattern = new Regex($"\\b({searchText})\\b", RegexOptions.IgnoreCase);

            int totalFiles = 0, totalMatches = 0;
            foreach (string file in supportedFiles)
            {
                DocumentCore dc = DocumentCore.Load(file);
                totalFiles++;

                // Show the subfolder too if the file isn't directly in the root folder;
                // in that case the name keeps its leading separator, e.g. "\sub\file.docx".
                string fileDir = Path.GetDirectoryName(file).TrimEnd(separators);
                string fileName = string.Equals(fileDir, rootPath, StringComparison.Ordinal)
                    ? Path.GetFileName(file)
                    : file.Substring(rootPath.Length);

                int matches = dc.Content.Find(wordPattern).Count();
                totalMatches += matches;

                Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.");
            }

            Console.WriteLine($"\nSearching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.");
            Console.WriteLine("Press any key ...");
            Console.ReadKey();
        }
    }
}

Download.

        
            Imports System
Imports System.IO
Imports System.Collections.Generic
Imports SautinSoft.Document
Imports System.Drawing
Imports System.Linq
Imports System.Text.RegularExpressions



Namespace Sample
	Friend Class Sample

		' Extensions of the files that are searched. The set compares ordinally and
		' case-insensitively, so ".PDF" and ".pdf" are treated alike in every culture.
		Private Shared ReadOnly SupportedExtensions As New HashSet(Of String)(StringComparer.OrdinalIgnoreCase) From {".docx", ".pdf", ".html", ".rtf"}

		Shared Sub Main(ByVal args() As String)
			FullTextSearching("c:\Test\Full text searching", "video")
		End Sub

		''' <summary>
		''' This sample shows how to launch full text search in the specific directory.
		''' Every *.docx, *.rtf, *.pdf and *.html file below <paramref name="searchPath"/>
		''' (subdirectories included) is loaded, the whole-word, case-insensitive occurrences
		''' of <paramref name="searchText"/> are counted, and one result line per file plus
		''' a final summary are written to the console. The method then waits for a key press.
		''' </summary>
		''' <param name="searchPath">Directory to search; may be absolute or relative.</param>
		''' <param name="searchText">
		''' Text to look for. It is inserted into the regular expression as-is, so regex
		''' metacharacters keep their special meaning; pass the value through
		''' Regex.Escape first if it must be matched literally.
		''' </param>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/document/examples/full-text-searching-in-documents-net-csharp-vb.php
		''' </remarks>
		Public Shared Sub FullTextSearching(ByVal searchPath As String, ByVal searchText As String)
			Dim searchDir As New DirectoryInfo(searchPath)

			' The file names returned below are built from searchDir.FullName, so display names
			' must be cut relative to that resolved path, not to the raw searchPath argument
			' (which may be relative or carry a trailing separator).
			Dim separators() As Char = {Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar}
			Dim rootPath As String = searchDir.FullName.TrimEnd(separators)

			' 1. Find the files to search in.
			' Only *.docx, *.rtf, *.pdf and *.html files are taken, including subdirectories.
			Dim allFiles() As String = Directory.GetFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories)
			Dim supportedFiles As List(Of String) = allFiles.Where(Function(candidate As String) SupportedExtensions.Contains(Path.GetExtension(candidate))).ToList()

			' 2. Perform the text search in each file using a loop
			' and count how many times each file contains the search text.
			Console.WriteLine($"The results for ""{searchText}"":")

			' The pattern does not depend on the file, so it is built once for the whole run.
			Dim wordPattern As New Regex($"\b({searchText})\b", RegexOptions.IgnoreCase)

			Dim totalFiles As Integer = 0, totalMatches As Integer = 0
			For Each file As String In supportedFiles
				Dim dc As DocumentCore = DocumentCore.Load(file)
				totalFiles += 1

				' Show the subfolder too if the file isn't directly in the root folder;
				' in that case the name keeps its leading separator, e.g. "\sub\file.docx".
				Dim fileDir As String = Path.GetDirectoryName(file).TrimEnd(separators)
				Dim fileName As String
				If String.Equals(fileDir, rootPath, StringComparison.Ordinal) Then
					' We are in the root folder.
					fileName = Path.GetFileName(file)
				Else
					fileName = file.Substring(rootPath.Length)
				End If

				Dim matches As Integer = dc.Content.Find(wordPattern).Count()
				totalMatches += matches

				Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.")
			Next file

			Console.WriteLine($"Searching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.")
			Console.WriteLine("Press any key ...")
			Console.ReadKey()
		End Sub
	End Class
End Namespace

Download.


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.

© SautinSoft 2002 - 2019