< Windows samples

TextExtractTest - VB

The sample illustrates the basic text extraction capabilities of PDFNet.

'
' Copyright (c) 2001-2018 by PDFTron Systems Inc. All Rights Reserved.
'

Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports PDFTRON.PDF

' This sample illustrates various text extraction capabilities of PDFNet.

Module TextExtractTestVB
	Dim pdfNetLoader As PDFNetLoader = pdftron.PDFNetLoader.Instance()

	' Entry point of the sample. Initializes PDFNet and then runs the text
	' extraction examples selected by the example* flags below against page 1
	' of newsletter.pdf:
	'   1. plain text via TextExtractor.GetAsText()
	'   2. XML via TextExtractor.GetAsXML()
	'   3. word list, one word per console line
	'   4. hand-built XML-like dump nesting flows, paragraphs, lines and words
	'   5. low-level extraction through ElementReader
	' PDFNetException messages are written to the console; any other exception
	' is reported through MsgBox.
	Sub Main()

		PDFNet.Initialize()

		' Relative path to the folder containing test files.
		Dim input_path As String = "../../../../TestFiles/"

		' Switches selecting which of the examples below are executed.
		Dim example1_basic As Boolean = True
		Dim example2_xml As Boolean = True
		Dim example3_wordlist As Boolean = True
		Dim example4_advanced As Boolean = False
		Dim example5_low_level As Boolean = False

		' Sample code showing how to use high-level text extraction APIs.
		Try
			Using doc As PDFDoc = New PDFDoc(input_path & "newsletter.pdf")
				doc.InitSecurityHandler()

				Dim pg As Page = doc.GetPage(1)
				If pg Is Nothing Then
					Console.WriteLine("Page not found.")
					Return
				End If

				Using txt As TextExtractor = New TextExtractor
					txt.Begin(pg)	 ' Read the page.
					' Other options you may want to consider...
					' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_no_dup_remove)
					' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_remove_hidden_text)
					' ...

					' Example 1. Get all text on the page in a single string.
					' Words will be separated with space or new line characters.
					If example1_basic Then
						' Get the word count.
						Console.WriteLine("Word Count: {0}", txt.GetWordCount())

						Console.WriteLine("")
						Console.WriteLine("- GetAsText --------------------------")
						Console.WriteLine(txt.GetAsText())
						Console.WriteLine("-----------------------------------------------------------")
					End If


					' Example 2. Get XML logical structure for the page.
					If example2_xml Then
						Console.WriteLine("")
						Console.WriteLine("- GetAsXML  --------------------------")
						Console.WriteLine(txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements Or TextExtractor.XMLOutputFlags.e_output_bbox Or TextExtractor.XMLOutputFlags.e_output_style_info))
						Console.WriteLine("-----------------------------------------------------------")
					End If


					' Example 3. Walk every line and print each of its words on
					' a separate console line.
					If example3_wordlist Then
						Dim word As TextExtractor.Word
						Dim line As TextExtractor.Line = txt.GetFirstLine()
						While line.IsValid()
							word = line.GetFirstWord()
							While word.IsValid()
								Console.WriteLine(word.GetString())
								word = word.GetNextWord()
							End While
							line = line.GetNextLine()
						End While
						Console.WriteLine("-----------------------------------------------------------")
					End If


					' Example 4. A more advanced text extraction example.
					' The output is XML structure containing paragraphs, lines, words,
					' as well as style and positioning information.
					' NOTE: word text is written as-is (not XML-escaped).
					If example4_advanced Then
						Dim bbox As Rect
						' -1 means "no flow / paragraph element is currently open".
						Dim cur_flow_id As Integer = -1
						Dim cur_para_id As Integer = -1

						Dim line As TextExtractor.Line
						Dim word As TextExtractor.Word
						Dim s As TextExtractor.Style
						Dim line_style As TextExtractor.Style

						' For each line on the page...
						line = txt.GetFirstLine()

						While line.IsValid()
							' A new flow closes the open paragraph and flow (if any)
							' before opening the next <Flow>.
							If cur_flow_id <> line.GetFlowID() Then
								If cur_flow_id <> -1 Then
									If cur_para_id <> -1 Then
										cur_para_id = -1
										Console.WriteLine("</Para>")
									End If
									Console.WriteLine("</Flow>")
								End If
								cur_flow_id = line.GetFlowID()
								Console.WriteLine("<Flow id=""{0}"">", cur_flow_id)
							End If

							' Likewise, a new paragraph closes the previous one.
							If cur_para_id <> line.GetParagraphID() Then
								If cur_para_id <> -1 Then
									Console.WriteLine("</Para>")
								End If
								cur_para_id = line.GetParagraphID()
								Console.WriteLine("<Para id=""{0}"">", cur_para_id)
							End If

							bbox = line.GetBBox()
							line_style = line.GetStyle()
							Console.Write("<Line box=""{0}, {1}, {2}, {3}""", bbox.x1, bbox.y1, bbox.x2, bbox.y2)
							PrintStyle(line_style)
							' Terminate the <Line ...> start tag opened above.
							Console.WriteLine(">")

							' For each word in the line...
							word = line.GetFirstWord()
							While word.IsValid()
								' Output the bounding box for the word.
								bbox = word.GetBBox()
								Console.Write("<Word box=""{0}, {1}, {2}, {3}""", bbox.x1, bbox.y1, bbox.x2, bbox.y2)

								' If the word style is different from the parent style, output the new style.
								s = word.GetStyle()
								If Not s.Equals(line_style) Then
									PrintStyle(s)
								End If

								Console.WriteLine(">")
								Console.Write(word.GetString())
								Console.WriteLine("</Word>")
								word = word.GetNextWord()
							End While

							Console.WriteLine("</Line>")
							line = line.GetNextLine()
						End While

						' Close whatever paragraph / flow is still open after the last line.
						If cur_flow_id <> -1 Then
							If cur_para_id <> -1 Then
								cur_para_id = -1
								Console.WriteLine("</Para>")
							End If
							Console.WriteLine("</Flow>")
						End If
					End If

					Console.WriteLine("Done.")
				End Using
			End Using
		Catch ex As PDFNetException
			Console.WriteLine(ex.Message)
		Catch ex As Exception
			MsgBox(ex.Message)
		End Try



		' Example 5. Sample code showing how to use low-level text extraction APIs.
		If example5_low_level Then

			Try
				' Open the test file
				Using doc As PDFDoc = New PDFDoc(input_path & "newsletter.pdf")
					doc.InitSecurityHandler()

					Using reader As ElementReader = New ElementReader

						' Part 1. Dump all text content of the page the iterator
						' currently points at. The loop over the remaining pages
						' is commented out, so only that one page is processed.
						Dim itr As PageIterator = doc.GetPageIterator()
						' While itr.HasNext()
						reader.Begin(itr.Current())
						DumpAllText(reader)
						reader.End()
						'   itr.Next()
						' End While

						' Part 2. Extract text based on the selection rectangle.
						Console.WriteLine("----------------------------------------------------")
						Console.WriteLine("Extract text based on the selection rectangle.")
						Console.WriteLine("----------------------------------------------------")

						Dim first_page As Page = doc.GetPage(1)
						Dim field1 As String = ReadTextFromRect(first_page, New Rect(27, 392, 563, 534), reader)
						Dim field2 As String = ReadTextFromRect(first_page, New Rect(28, 551, 106, 623), reader)
						Dim field3 As String = ReadTextFromRect(first_page, New Rect(208, 550, 387, 621), reader)

						Console.WriteLine("Field 1: {0}", field1)
						Console.WriteLine("Field 2: {0}", field2)
						Console.WriteLine("Field 3: {0}", field3)
						' ... 

						Console.WriteLine("Done.")
					End Using
				End Using

			Catch ex As PDFNetException
				Console.WriteLine(ex.Message)
			Catch ex As Exception
				MsgBox(ex.Message)
			End Try
		End If
	End Sub


	' Writes the given style to the console as a single XML-like attribute
	' (no trailing newline), in the form
	'    style="font-family: NAME; font-size: SIZE;[ serif;]"
	' s - the text style to print; it is read through GetFontName(),
	'     GetFontSize() and IsSerif() and is not modified here.
	Sub PrintStyle(ByRef s As TextExtractor.Style)
		' The leading space separates this attribute from whatever the caller
		' wrote before it; the value's opening quote comes right after "style=".
		Console.Write(" style=""font-family: {0}; font-size: {1};", s.GetFontName(), s.GetFontSize())
		If s.IsSerif() Then
			Console.Write(" serif;")
		End If
		' Close the attribute value only after the optional serif marker so
		' the marker ends up inside the quotes.
		Console.Write("""")
	End Sub

	' LowLevelTextExtractUtils ----------------------------------------

	' Reads elements from reader until Next() returns Nothing and prints the
	' text of every text element on its own console line. Text block starts
	' and ends are announced with marker lines. Form XObjects are entered with
	' FormBegin(), processed by a recursive call, and left again with End().
	Sub DumpAllText(ByRef reader As ElementReader)
		Dim current As Element = reader.Next()

		Do Until current Is Nothing		 ' Read page contents
			Dim kind As Element.Type = current.GetType()

			Select Case kind
				Case Element.Type.e_text_begin
					Console.WriteLine()
					Console.WriteLine("--> Text Block Begin")

				Case Element.Type.e_text_end
					Console.WriteLine()
					Console.WriteLine("--> Text Block End")

				Case Element.Type.e_text
					' The bounding box is fetched but only used by the
					' commented-out diagnostic line below.
					Dim box As Rect = New Rect
					current.GetBBox(box)
					' Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", box.x1, box.y1, box.x2, box.y2)

					Console.WriteLine(current.GetTextString())

				Case Element.Type.e_text_new_line
					' Intentionally produces no output.
					' Console.WriteLine()
					' Console.WriteLine("--> New Line")

				Case Element.Type.e_form
					' Process form XObjects
					reader.FormBegin()
					DumpAllText(reader)
					reader.End()
			End Select

			current = reader.Next()
		Loop
	End Sub

	' Accumulator shared by ReadTextFromRect and RectTextSearch: the text
	' collected so far for the current search.
	Private _srch_str As String

	' A helper method for ReadTextFromRect.
	' Reads elements from reader until Next() returns Nothing and appends to
	' _srch_str the text of every text element for which
	' Rect.IntersectRect(bbox, pos) reports True. Form XObjects are entered
	' with FormBegin() and searched by a recursive call.
	Sub RectTextSearch(ByRef reader As ElementReader, ByRef pos As Rect)
		Dim current As Element = reader.Next()

		Do Until current Is Nothing		 ' Read page contents
			Dim kind As Element.Type = current.GetType()

			Select Case kind
				Case Element.Type.e_text
					Dim box As Rect = New Rect
					current.GetBBox(box)

					' box is both the first input and the receiver of IntersectRect.
					If box.IntersectRect(box, pos) Then
						_srch_str &= current.GetTextString()
					End If

				Case Element.Type.e_text_new_line
					' Nothing to collect for a new-line element.

				Case Element.Type.e_form
					' Process form XObjects
					reader.FormBegin()
					RectTextSearch(reader, pos)
					reader.End()
			End Select

			current = reader.Next()
		Loop
	End Sub


	' A utility method used to extract all text content from
	' a given selection rectangle. The rectangle coordinates are
	' expressed in PDF user/page coordinate system.
	'
	' page   - the page whose content is scanned.
	' pos    - the selection rectangle.
	' reader - the ElementReader used for the scan; it is begun on the page
	'          here and ended again before returning, even when the search
	'          throws, so the caller gets it back in a reusable state.
	' Returns the concatenated text of all matching text elements
	' (an empty string when nothing matches).
	' Uses the module-level _srch_str field as scratch storage.
	Function ReadTextFromRect(ByRef page As Page, ByRef pos As Rect, ByRef reader As ElementReader) As String
		_srch_str = ""
		' Begin stays outside the Try: if it fails there is nothing to end.
		reader.Begin(page)
		Try
			RectTextSearch(reader, pos)
		Finally
			reader.End()
		End Try
		Return _srch_str
	End Function

End Module