Some test text!

< Windows samples

PDF Data Extraction in VB (Images, Text, Paths)

Sample VB code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our PDF Parsing & Content Extraction Library.

Step 1: Get your free trial license key, or sign in

Start Trial
Sign in

Step 2: Add the code:

'
' Copyright (c) 2001-2018 by PDFTron Systems Inc. All Rights Reserved.
'
' A sample project illustrating some extraction capabilities of ElementReader
' in more detail
'

Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF

Module ElementReaderAdvTestVB
	' Reference to the PDFNetLoader singleton, captured once by the module constructor.
	' NOTE(review): presumably this is what makes the PDFNet library loadable before
	' Main calls PDFNet.Initialize() - confirm against the PDFNetLoader documentation.
	Private m_loader As pdftron.PDFNetLoader

	' Module constructor: fetches the loader singleton and stores it in the module-level field.
	Sub New()
		m_loader = pdftron.PDFNetLoader.Instance()
	End Sub

	Dim m_buf As String

	' Prints a description of a single path element to the console:
	'   * whether it is a clipping path,
	'   * its segments as a quoted, SVG-like string (M = move, L = line, C = cubic,
	'     rectangles expanded to "M .. L .. L .. L .. Z"),
	'   * whether it is stroked and/or filled, and whether a pattern is involved.
	' Finally walks the graphics-state attributes that the reader reports as changed.
	'
	' reader - the reader that produced "path"; used only for GetChangesIterator().
	' path   - the path element to describe.
	Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
		If path.IsClippingPath() Then
			Console.WriteLine("This is a clipping path")
		End If

		Dim path_data As PathData = path.GetPathData()
		Dim points As Double() = path_data.points
		Dim next_pt As Integer = 0	' index of the next unread coordinate in "points"
		Dim x1, y1, x2, y2, x3, y3 As Double

		' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

		' Open the quoted point list. VB escapes a double quote by doubling it (a backslash
		' is an ordinary character); the matching closing quote is written after the loop.
		Console.Write(" Path Data Points := """)
		For Each op As Byte In path_data.operators
			' Each operator consumes a fixed number of coordinates from "points".
			Select Case CType(op, PathData.PathSegmentType)
				Case PathData.PathSegmentType.e_moveto
					x1 = points(next_pt)
					y1 = points(next_pt + 1)
					next_pt += 2
					Console.Write("M{0:g5} {1:g5}", x1, y1)

				Case PathData.PathSegmentType.e_lineto
					x1 = points(next_pt)
					y1 = points(next_pt + 1)
					next_pt += 2
					Console.Write(" L{0:g5} {1:g5}", x1, y1)

				Case PathData.PathSegmentType.e_cubicto
					x1 = points(next_pt)
					y1 = points(next_pt + 1)
					x2 = points(next_pt + 2)
					y2 = points(next_pt + 3)
					x3 = points(next_pt + 4)
					y3 = points(next_pt + 5)
					next_pt += 6
					Console.Write(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
					 x1, y1, x2, y2, x3, y3)

				Case PathData.PathSegmentType.e_rect
					' Stored as origin + size; printed as the four corners of the rectangle.
					x1 = points(next_pt)
					y1 = points(next_pt + 1)
					Dim w As Double = points(next_pt + 2)
					Dim h As Double = points(next_pt + 3)
					next_pt += 4
					x2 = x1 + w
					y2 = y1
					x3 = x2
					y3 = y1 + h
					Console.Write("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
					 x1, y1, x2, y2, x3, y3, x1, y3)

				Case PathData.PathSegmentType.e_closepath
					Console.WriteLine(" Close Path")

				Case Else
					' Operator not handled by this sample.
					System.Diagnostics.Debug.Assert(False)
			End Select
		Next

		Console.Write(""" ")

		Dim gs As GState = path.GetGState()

		' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
		If path.IsStroked() Then
			Console.WriteLine("Stroke path")
			If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
				Console.WriteLine("Path has associated pattern")
			Else
				' Get stroke color (you can use PDFNet color conversion facilities)
				' Dim rgb As ColorPt
				' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
			End If
		End If

		If path.IsFilled() Then
			Console.WriteLine("Fill path")
			If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
				Console.WriteLine("Path has associated pattern")
			Else
				' Dim rgb As ColorPt
				' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
			End If
		End If

		' Process any changes in graphics state  ---------------------------------
		' The branches are intentionally empty: each one only names the GState getter
		' that corresponds to the changed attribute.
		Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
		While gs_itr.HasNext()
			Select Case gs_itr.Current()
				Case GState.GStateAttribute.e_transform
					' Get the transform matrix for this element. Unlike path.GetCTM(),
					' which returns the full transformation matrix, gs.GetTransform()
					' returns only the matrix that was installed for this element.
					'
					' gs.GetTransform()
				Case GState.GStateAttribute.e_line_width
					' gs.GetLineWidth()
				Case GState.GStateAttribute.e_line_cap
					' gs.GetLineCap()
				Case GState.GStateAttribute.e_line_join
					' gs.GetLineJoin()
				Case GState.GStateAttribute.e_flatness
				Case GState.GStateAttribute.e_miter_limit
					' gs.GetMiterLimit()
				Case GState.GStateAttribute.e_dash_pattern
					' Dim dashes As Double()
					' gs.GetDashes(dashes)
					' gs.GetPhase()
			End Select

			gs_itr.Next()
		End While
	End Sub

	' Prints every text run between the current position of page_reader and the next
	' e_text_end element: font name, effective font size, fill color as RGB, and for
	' each character its code (written as a character) and its transformed position.
	' Returns as soon as e_text_end is read, or when the reader runs out of elements.
	'
	' page_reader - reader positioned just after an e_text_begin element.
	Sub ProcessText(ByRef page_reader As ElementReader)
		' Begin text element
		Console.WriteLine("Begin Text Block:")

		Dim element As Element = page_reader.Next()
		While Not IsNothing(element)
			If element.GetType() = Element.Type.e_text_end Then
				' Finish the text block
				Console.WriteLine("End Text Block.")
				Return
			End If

			If element.GetType() = Element.Type.e_text Then
				Dim gs As GState = element.GetGState()

				' Convert the fill (text) color to RGB once; it is printed further down.
				Dim rgb As ColorPt = New ColorPt
				gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)

				' The stroke color is available the same way:
				' gs.GetStrokeColorSpace(), gs.GetStrokeColor()

				Dim font As Font = gs.GetFont()

				Console.Write("Font Name: ")
				Console.Write(font.GetName())
				' font.IsFixedWidth()
				' font.IsSerif()
				' font.IsSymbolic()
				' font.IsItalic()
				' ...

				' Dim word_spacing As Double = gs.GetWordSpacing()
				' Dim char_spacing As Double = gs.GetCharSpacing()

				' Use element.GetCTM() if you are interested in the CTM
				' (current transformation matrix).
				Dim ctm As Matrix2D = element.GetCTM()
				Dim text_mtx As Matrix2D = element.GetTextMatrix()

				' mtx = CTM concatenated with the text matrix; the length of its (b, d)
				' column is used to scale the nominal font size.
				Dim mtx As Matrix2D = New Matrix2D
				mtx.Set(ctm)
				mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
				Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
				Console.Write(" Font Size: {0:f}", font_sz_scale_factor * gs.GetFontSize())

				Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
					CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

				Dim x, y As Double
				Dim char_code As Integer

				Dim itr As CharIterator = element.GetCharIterator()
				While itr.HasNext()
					Console.Write("Character code: ")
					char_code = itr.Current().char_code
					' Chr() raises ArgumentException for codes above 255 when the current
					' code page is single-byte, so larger codes are written with ChrW()
					' and anything ChrW() cannot represent is shown as "?".
					If char_code >= 0 AndAlso char_code <= 255 Then
						Console.Write(Chr(char_code))
					ElseIf char_code > 255 AndAlso char_code <= 65535 Then
						Console.Write(ChrW(char_code))
					Else
						Console.Write("?")
					End If

					x = itr.Current().x      ' character positioning information
					y = itr.Current().y

					' To get the exact character positioning information you need to
					' concatenate current text matrix with CTM and then multiply
					' relative positioning coordinates with the resulting matrix.
					'
					mtx.Set(ctm)
					mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
					mtx.Mult(x, y)
					Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
					itr.Next()
				End While

				Console.WriteLine()
			End If

			element = page_reader.Next()
		End While
	End Sub

	' Prints the pixel dimensions of an image element and reads its pixels, converted
	' to RGB by the Image2RGB filter, into a local byte buffer.
	'
	' image - the image element to process.
	Sub ProcessImage(ByRef image As Element)
		' Other image properties are available as well, e.g.:
		' image.IsImageMask()
		' image.IsImageInterpolate()
		Dim width As Integer = image.GetImageWidth()
		Dim height As Integer = image.GetImageHeight()
		' Three bytes (R, G, B) per pixel of the converted image.
		Dim out_data_sz As Integer = width * height * 3

		Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

		' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

		' You can use GetImageData to read the raw (decoded) image data
		'image.GetBitsPerComponent()
		'image.GetImageData()	' get raw image data
		' .... or use Image2RGB filter that converts every image to RGB format,
		' This should save you time since you don't need to deal with color conversions,
		' image up-sampling, decoding etc.

		Dim img_conv As Image2RGB = New Image2RGB(image)	   ' Extract and convert image to RGB 8-bpc format
		Dim reader As FilterReader = New FilterReader(img_conv)

		' A buffer used to keep image data. It has to be allocated before the read;
		' VB array bounds are inclusive, hence the "- 1" (a zero-sized image yields an
		' empty array).
		Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}

		reader.Read(image_data_out)
		' image_data_out contains RGB image data.

		' Note that you don't need to read a whole image at a time. Alternatively
		' you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
		' until the function returns 0.
	End Sub

	' Reads elements from "reader" until it is exhausted and dispatches each one by type:
	' paths to ProcessPath, text blocks to ProcessText, images to ProcessImage; form
	' XObjects are entered with FormBegin() and processed recursively.
	'
	' reader - a reader on which Begin() (or FormBegin()) has already been called.
	Sub ProcessElements(ByRef reader As ElementReader)
		' Fetch exactly one element before the loop, so the first element of the
		' page / form is processed like every other one instead of being discarded.
		Dim element As Element = reader.Next()
		While Not IsNothing(element)		 ' Read page contents
			Select Case element.GetType()
				Case Element.Type.e_path
					' Process path data...
					ProcessPath(reader, element)
				Case Element.Type.e_text_begin
					' Process text strings...
					ProcessText(reader)
				Case Element.Type.e_form
					' Process form XObjects
					reader.FormBegin()
					ProcessElements(reader)
					reader.End()
				Case Element.Type.e_image
					' Process Images
					ProcessImage(element)
			End Select
			element = reader.Next()
		End While
	End Sub

	' Entry point: initializes PDFNet, opens TestFiles/newsletter.pdf and, for every
	' page, prints the crop box and the element information produced by ProcessElements.
	Sub Main()

		PDFNet.Initialize()

		' Relative path to the folder containing test files.
		Dim input_path As String = "../../../../TestFiles/"
		' Dim output_path As String = "../../../../TestFiles/Output/"

		Console.WriteLine("-------------------------------------------------")
		Console.WriteLine("Extract page element information from all")
		Console.WriteLine("pages in the document.")

		' Open the test file
		Console.WriteLine("Opening the input file...")
		Using doc As PDFDoc = New PDFDoc(input_path & "newsletter.pdf")
			doc.InitSecurityHandler()

			' A single reader is reused for all pages; Begin()/End() bracket each page.
			Using page_reader As ElementReader = New ElementReader
				Dim itr As PageIterator = doc.GetPageIterator()
				While itr.HasNext()	'  Read every page
					Console.WriteLine("Page {0:d} ----------------------------------------", _
					 itr.GetPageNumber())

					Dim crop_box As Rect = itr.Current().GetCropBox()
					Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
					Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

					page_reader.Begin(itr.Current())
					ProcessElements(page_reader)
					page_reader.End()
					itr.Next()
				End While
			End Using
		End Using
		Console.WriteLine("Done.")

	End Sub

End Module