Some test text!


PDF data extraction in VB (images, text, paths)

More languages

More languages
Java (Android)
C# (.NET Core)
JS (Node.js)
C# (UWP)
C# (Xamarin)

Sample VB code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of PDFTron SDK.

' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
' A sample project illustrating some extraction capabilities of ElementReader
' in more detail

Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF

''' <summary>
''' Sample module that opens a PDF, walks every page with an ElementReader and prints
''' information about the path, text and image elements it encounters.
''' </summary>
Module ElementReaderAdvTestVB
	' Holds the PDFNetLoader instance obtained in the module constructor.
	Dim pdfNetLoader As PDFNetLoader

	Sub New()
		pdfNetLoader = pdftron.PDFNetLoader.Instance()
	End Sub

	''' <summary>
	''' Prints the segment data of a path element, reports whether the path is stroked
	''' and/or filled, and walks the graphics-state changes reported by the reader.
	''' </summary>
	''' <param name="reader">Reader that produced the element; queried for graphics-state changes.</param>
	''' <param name="path">The path element to describe.</param>
	Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
		If path.IsClippingPath() Then
			Console.WriteLine("This is a clipping path")
		End If

		Dim path_data As PathData = path.GetPathData()
		Dim points As Double() = path_data.points
		Dim ops As Byte() = path_data.operators
		Dim pt As Integer = 0	' index of the next unread coordinate in points
		Dim x1, y1, x2, y2, x3, y3 As Double

		' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

		' The opening quote written here is closed by the Console.Write after the loop.
		Console.Write(" Path Data Points := """)
		For Each op As Byte In ops
			' Each operator consumes a fixed number of coordinates from points.
			Select Case CType(op, PathData.PathSegmentType)
				Case PathData.PathSegmentType.e_moveto
					x1 = points(pt)
					y1 = points(pt + 1)
					pt += 2
					Console.Write("M{0:g5} {1:g5}", x1, y1)

				Case PathData.PathSegmentType.e_lineto
					x1 = points(pt)
					y1 = points(pt + 1)
					pt += 2
					Console.Write(" L{0:g5} {1:g5}", x1, y1)

				Case PathData.PathSegmentType.e_cubicto
					x1 = points(pt)
					y1 = points(pt + 1)
					x2 = points(pt + 2)
					y2 = points(pt + 3)
					x3 = points(pt + 4)
					y3 = points(pt + 5)
					pt += 6
					Console.Write(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
						x1, y1, x2, y2, x3, y3)

				Case PathData.PathSegmentType.e_rect
					' A rectangle is stored as x, y, width, height; expand it into
					' its four corners so it prints as an explicit closed sub-path.
					x1 = points(pt)
					y1 = points(pt + 1)
					Dim w As Double = points(pt + 2)
					Dim h As Double = points(pt + 3)
					pt += 4
					x2 = x1 + w
					y2 = y1
					x3 = x2
					y3 = y1 + h
					Dim x4 As Double = x1
					Dim y4 As Double = y3
					Console.Write("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
						x1, y1, x2, y2, x3, y3, x4, y4)

				Case PathData.PathSegmentType.e_closepath
					Console.WriteLine(" Close Path")
			End Select
		Next

		Console.Write(""" ")

		Dim gs As GState = path.GetGState()

		' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
		If path.IsStroked() Then
			Console.WriteLine("Stroke path")
			If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
				Console.WriteLine("Path has associated pattern")
			Else
				' Get stroke color (you can use PDFNet color conversion facilities)
				' Dim rgb As ColorPt = New ColorPt
				' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
			End If
		Else
			' Do not stroke path
		End If

		If path.IsFilled() Then
			Console.WriteLine("Fill path")
			If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
				Console.WriteLine("Path has associated pattern")
			Else
				' Dim rgb As ColorPt = New ColorPt
				' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
			End If
		Else
			' Do not fill path
		End If

		' Process any changes in graphics state  ---------------------------------
		Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
		While gs_itr.HasNext()
			Select Case gs_itr.Current()
				Case GState.GStateAttribute.e_transform
					' Get transform matrix for this element. Unlike path.GetCTM()
					' that returns the full transformation matrix, gs.GetTransform()
					' returns only the matrix that was installed for this element.
					' gs.GetTransform()
				Case GState.GStateAttribute.e_line_width
					' gs.GetLineWidth()
				Case GState.GStateAttribute.e_line_cap
					' gs.GetLineCap()
				Case GState.GStateAttribute.e_line_join
					' gs.GetLineJoin()
				Case GState.GStateAttribute.e_flatness
					' (no accessor shown for this attribute in the sample)
				Case GState.GStateAttribute.e_miter_limit
					' gs.GetMiterLimit()
				Case GState.GStateAttribute.e_dash_pattern
					' Dim dashes As Double()
					' gs.GetDashes(dashes)
					' gs.GetPhase()
			End Select
			' Advance the iterator; without this the loop never terminates.
			gs_itr.Next()
		End While

		' NOTE(review): added so the next element reports only its own changes, as the
		' vendor sample does - confirm against the ElementReader API reference.
		reader.ClearChangeList()
	End Sub

	''' <summary>
	''' Consumes the elements of one text block from the reader, printing details for
	''' every text run, and returns once the end-of-text element has been read.
	''' </summary>
	''' <param name="page_reader">Reader positioned just after a text-begin element.</param>
	Sub ProcessText(ByRef page_reader As ElementReader)
		' Begin text element
		Console.WriteLine("Begin Text Block:")

		Dim elem As Element = page_reader.Next()
		While Not IsNothing(elem)
			Select Case elem.GetType()
				Case Element.Type.e_text_end
					' Finish the text block and hand control back to the caller so
					' the remaining page elements are dispatched normally.
					Console.WriteLine("End Text Block.")
					Return
				Case Element.Type.e_text
					DescribeTextRun(elem)
			End Select
			elem = page_reader.Next()
		End While
	End Sub

	' Prints font name, effective font size, fill color and per-character positions
	' for a single text-run element.
	Private Sub DescribeTextRun(ByVal text_run As Element)
		Dim gs As GState = text_run.GetGState()

		' Fill color converted to RGB; the stroke color is available the same way via
		' gs.GetStrokeColorSpace() / gs.GetStrokeColor().
		Dim cs_fill As ColorSpace = gs.GetFillColorSpace()
		Dim fill As ColorPt = gs.GetFillColor()
		Dim rgb As ColorPt = New ColorPt
		cs_fill.Convert2RGB(fill, rgb)

		Dim text_font As Font = gs.GetFont()

		Console.Write("Font Name: ")
		Console.WriteLine(text_font.GetName())
		' text_font.IsFixedWidth()
		' text_font.IsSerif()
		' text_font.IsSymbolic()
		' text_font.IsItalic()
		' ...

		' Dim word_spacing As Double = gs.GetWordSpacing()
		' Dim char_spacing As Double = gs.GetCharSpacing()

		Dim ctm As Matrix2D = text_run.GetCTM()
		Dim text_mtx As Matrix2D = text_run.GetTextMatrix()

		' Combine the CTM with the text matrix once. The combined matrix is the same
		' for every character of the run, so it must not be re-concatenated per glyph.
		' NOTE(review): assumes a freshly constructed Matrix2D is the identity - confirm.
		Dim mtx As Matrix2D = New Matrix2D
		mtx.Concat(ctm.m_a, ctm.m_b, ctm.m_c, ctm.m_d, ctm.m_h, ctm.m_v)
		mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)

		Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
		Dim font_size As Double = gs.GetFontSize()
		Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)

		Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
			CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

		Dim x, y As Double
		Dim char_code As Integer

		Dim itr As CharIterator = text_run.GetCharIterator()
		While itr.HasNext()
			Console.Write("Character code: ")
			char_code = itr.Current().char_code
			If char_code >= 32 AndAlso char_code <= 127 Then
				' Print the character itself when it is in the ASCII range.
				Console.Write(ChrW(char_code))
			End If

			x = itr.Current().x      ' character positioning information
			y = itr.Current().y

			' Map the relative position through the combined CTM/text matrix.
			mtx.Mult(x, y)
			Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)

			' Advance the iterator; without this the loop never terminates.
			itr.Next()
		End While
	End Sub

	''' <summary>
	''' Prints the pixel dimensions of an image element and reads its data through an
	''' Image2RGB filter into a local buffer.
	''' </summary>
	''' <param name="image">The image element to process.</param>
	Sub ProcessImage(ByRef image As Element)
		' Queried for illustration only; the values are not used further below.
		Dim image_mask As Boolean = image.IsImageMask()
		Dim interpolate As Boolean = image.IsImageInterpolate()

		Dim width As Integer = image.GetImageWidth()
		Dim height As Integer = image.GetImageHeight()
		Dim out_data_sz As Integer = width * height * 3	' three bytes (R, G, B) per pixel

		Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

		' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

		If out_data_sz <= 0 Then
			' Nothing to read for a zero-sized image.
			Return
		End If

		' You can use GetImageData to read the raw (decoded) image data
		'image.GetImageData()	' get raw image data
		' .... or use Image2RGB filter that converts every image to RGB format,
		' This should save you time since you don't need to deal with color conversions,
		' image up-sampling, decoding etc.

		Dim img_conv As Image2RGB = New Image2RGB(image)	   ' Extract and convert image to RGB 8-bpc format
		Dim reader As FilterReader = New FilterReader(img_conv)

		' A buffer used to keep image data. VB array bounds are inclusive, hence "- 1".
		Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}

		reader.Read(image_data_out)
		' image_data_out contains RGB image data.

		' Note that you don't need to read a whole image at a time. Alternatively
		' you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
		' until the function returns 0.
	End Sub

	''' <summary>
	''' Reads every element available from the reader and dispatches it to the matching
	''' handler; form XObjects are entered and processed recursively.
	''' </summary>
	''' <param name="reader">Reader that has already been started on a page or form.</param>
	Sub ProcessElements(ByRef reader As ElementReader)
		' Fetch exactly one element before the loop so the first one is not skipped.
		Dim elem As Element = reader.Next()

		While Not IsNothing(elem)		 ' Read page contents
			Select Case elem.GetType()
				Case Element.Type.e_path
					' Process path data...
					ProcessPath(reader, elem)
				Case Element.Type.e_text_begin
					' Process text strings...
					ProcessText(reader)
				Case Element.Type.e_form
					' Process form XObjects
					reader.FormBegin()
					ProcessElements(reader)
					reader.End()
				Case Element.Type.e_image
					' Process Images
					ProcessImage(elem)
			End Select
			elem = reader.Next()
		End While
	End Sub

	''' <summary>
	''' Entry point: opens the sample document and prints, for every page, its crop box
	''' followed by the element information produced by ProcessElements.
	''' </summary>
	Sub Main()
		' NOTE(review): no PDFNet initialization call is visible in this file; confirm
		' the SDK is initialized (with the license key) before this code runs.

		' Relative path to the folder containing test files.
		Dim input_path As String = "../../../../TestFiles/"
		' Dim output_path As String = "../../../../TestFiles/Output/"

		Try
			Console.WriteLine("Extract page element information from all")
			Console.WriteLine("pages in the document.")

			' Open the test file
			Console.WriteLine("Opening the input file...")
			Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
				doc.InitSecurityHandler()

				Using page_reader As ElementReader = New ElementReader
					Dim itr As PageIterator = doc.GetPageIterator()
					While itr.HasNext()	'  Read every page
						Console.WriteLine("Page {0:d} ----------------------------------------", _
							itr.GetPageNumber())

						Dim crop_box As Rect = itr.Current().GetCropBox()
						Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
						Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

						page_reader.Begin(itr.Current())
						ProcessElements(page_reader)
						page_reader.End()

						' Advance to the next page; without this the loop never terminates.
						itr.Next()
					End While
				End Using
			End Using
		Catch ex As PDFNetException
			Console.WriteLine(ex.Message)
		End Try

	End Sub

End Module

Free Trial

Get unlimited trial usage of PDFTron SDK to bring accurate, reliable, and fast document processing capabilities to any application or workflow.

Select a platform to get started with your free trial.

Unlimited usage. No email address required.