Some test text!

menu
search
chevron_right .NET Framework samples

Using OCR in VB to make searchable PDFs and extract text

This sample VB code shows how to use the PDFTron OCR module on scanned documents in multiple languages. The OCR module can make PDFs searchable and extract scanned text for further indexing.

To run this sample, get started with a free trial of PDFTron SDK.

'---------------------------------------------------------------------------------------
' Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
' Consult legal.txt regarding legal and license information.     
'---------------------------------------------------------------------------------------
Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.SDF
Imports pdftron.PDF

' <summary>
'---------------------------------------------------------------------------------------
' The following sample illustrates how to use OCR module
'---------------------------------------------------------------------------------------
' </summary>
Module OCRTestVB
    ' Holds the PDFNetLoader singleton obtained in the module constructor below.
    Dim pdfNetLoader As PDFNetLoader

    ' Module constructor: fetches the PDFNetLoader instance before Main() runs.
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' The main entry point for the application.
    '
    ' Runs six independent OCR examples. Each example is wrapped in its own
    ' Try/Catch so that a PDFNetException in one example is reported on the
    ' console and does not prevent the remaining examples from running.
    Sub Main()

        ' The first step in every application using PDFNet is to initialize the 
        ' library and set the path to common PDF resources. The library is usually 
        ' initialized only once, but calling Initialize() multiple times is also fine.
        PDFNet.Initialize()

        ' Can optionally set path to the OCR module
        PDFNet.AddResourceSearchPath("../../../../../Lib/")

        ' Bail out early with an explanatory message when the optional OCR add-on is missing.
        If Not OCRModule.IsModuleAvailable() Then
            Console.WriteLine("")
            Console.WriteLine("Unable to run OCRTest: PDFTron SDK OCR module not available.")
            Console.WriteLine("---------------------------------------------------------------")
            Console.WriteLine("The OCR module is an optional add-on, available for download")
            Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this")
            Console.WriteLine("module, ensure that the SDK is able to find the required files")
            Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
            Console.WriteLine("")
            Return
        End If

        ' Relative paths to the folders containing the test inputs and receiving the outputs.
        Dim input_path As String = "../../../../TestFiles/OCR/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        '--------------------------------------------------------------------------------
        ' Example 1) Process image
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("eng")

                ' C) Run OCR on the .png with options
                OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)

                ' D) check the result
                doc.Save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 1: psychomachia_excerpt.png")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 2) Process document using multiple languages
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with multiple target languages, English will always be considered as secondary language
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("rus")
                opts.AddLang("deu")

                ' C) Run OCR on the .jpg with options
                OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

                ' D) check the result
                doc.Save(output_path + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 2: multi_lang.jpg")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "german_kids_song.pdf")

                ' B) Setup options with a single language and an ignore zone
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("deu")

                Dim zones As RectCollection = New RectCollection()
                zones.AddRect(1768, 680, 2056, 3044)
                opts.AddIgnoreZonesForPage(zones, 1)

                ' C) Run OCR on the .pdf with options
                OCRModule.ProcessPDF(doc, opts)

                ' D) check the result
                doc.Save(output_path + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 3: german_kids_song.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 4) Process multipage tiff with text/ignore zones specified for each page
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with a single language plus text/ignore zones
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("eng")

                ' The same RectCollection is reused for every page: it is filled,
                ' handed to the options object, and then cleared for the next set.
                Dim zones As RectCollection = New RectCollection()

                ' ignore Signature box in the first 2 pages
                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 1)
                zones.Clear()

                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 2)
                zones.Clear()

                ' can use a combination of ignore and text boxes to focus on the page area of interest,
                ' as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                zones.AddRect(992, 1276, 1368, 1372)
                opts.AddIgnoreZonesForPage(zones, 3)
                zones.Clear()

                ' the remaining rectangles are the text zones for page 3

                ' select horizontal BUFFER ZONE sign
                zones.AddRect(900, 2384, 1236, 2480)
                ' select right vertical BUFFER ZONE sign
                zones.AddRect(1960, 1976, 2016, 2296)
                ' select Lot No.
                zones.AddRect(696, 1028, 1196, 1128)

                ' select part of the plan inside the BUFFER ZONE
                zones.AddRect(428, 1484, 1784, 2344)
                zones.AddRect(948, 1288, 1672, 1476)
                ' These rectangles are regions to recognize, so they must be registered
                ' as text zones; registering them as ignore zones would exclude them.
                opts.AddTextZonesForPage(zones, 3)

                ' C) Run OCR on the .tif with options
                OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

                ' D) check the result
                doc.Save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 4: bc_environment_protection.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the
        ' dictionary or filtering out special characters), and finally applying modified OCR JSON to the source PDF document 
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "zero_value_test_no_text.pdf")

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("eng")

                ' C) Run OCR on the .pdf 
                Dim json As String = OCRModule.GetOCRJsonFromPDF(doc, opts)

                ' D) Post-processing step (whatever it might be); this sample leaves the JSON
                ' untouched and only reports progress
                Console.WriteLine("Have OCR result JSON, re-applying to PDF")

                ' E) Apply potentially modified OCR JSON to the PDF
                OCRModule.ApplyOCRJsonToPDF(doc, json)

                ' F) check the result
                doc.Save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                opts.AddLang("eng")

                ' C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                ' in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                Dim xml As String = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)

                ' D) Post-processing step (whatever it might be); this sample leaves the XML
                ' untouched and only reports progress
                Console.WriteLine("Have OCR result XML, re-applying to PDF")

                ' E) Apply potentially modified OCR XML to the PDF
                OCRModule.ApplyOCRXmlToPDF(doc, xml)

                ' F) check the result
                doc.Save(output_path + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

    End Sub

End Module
close

Free Trial

Get unlimited trial usage of PDFTron SDK to bring accurate, reliable, and fast document processing capabilities to any application or workflow.

Select a platform to get started with your free trial.

Unlimited usage. No email address required.

PDFTron Receives USD$71 Million Growth Investment Led By Silversmith Capital Partners

Learn more
close