Some test text!

menu

PDF logical structure reader in VB

More languages

chevron_right
More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
VB
C# (Xamarin)

Sample VB code that uses the PDFTron SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.

Get Started | Samples | Download

To run this sample, get started with a free trial of PDFTron SDK.

'
' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
'

Imports System
Imports System.Collections
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF
Imports pdftron.PDF.Struct

Module LogicalStructureTestCS
    ' Holds the PDFNetLoader instance obtained in Sub New(). Nothing else in this
    ' module reads the field; it only keeps a reference to the loader.
    Dim pdfNetLoader As PDFNetLoader
    ' Module constructor: fetches the PDFNetLoader singleton when the module is
    ' initialized. NOTE(review): presumably this makes the PDFNet assemblies/native
    ' library resolvable before any PDFNet call in Main - confirm against the SDK docs.
    Sub New()
        ' Fully qualified, presumably because the field above shares its
        ' (case-insensitive) name with the PDFNetLoader type.
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Starts a new console line and pads it with two spaces per indent level.
    ' A zero or negative indent produces only the line break.
    Sub PrintIndent(ByVal indent As Integer)
        Console.WriteLine()

        If indent > 0 Then
            ' Emit the whole padding in one write instead of one write per level.
            Console.Write(New String(" "c, indent * 2))
        End If
    End Sub

    ' Sample 1 helper: recursively prints a structure element and all of its kids.
    '
    ' element - structure element to print; an invalid element is silently skipped.
    ' indent  - nesting depth used for the element's own "Type: ..." line. Its kids
    '           (content items and child structure elements) are printed one level deeper.
    Sub ProcessStructElement(ByVal element As SElement, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        ' Print the header at the caller's depth, then step one level in for the kids.
        ' indent is ByVal, so the caller's value is unaffected. (This replaces a
        ' machine-converted "indent++" that went through Interlocked.Increment/Math.Min.)
        PrintIndent(indent)
        indent += 1
        Console.Write("Type: " & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(". Title: " & element.GetTitle())
        End If

        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            If element.IsContentItem(i) Then
                ' Leaf node: report the page it belongs to and how it is referenced.
                Dim cont As ContentItem = element.GetAsContentItem(i)
                Dim type As ContentItem.Type = cont.[GetType]()
                Dim page As Page = cont.GetPage()
                PrintIndent(indent)
                Console.Write("Content Item. Part of page #" & page.GetIndex())
                PrintIndent(indent)

                ' Any other content item type prints nothing after the indent.
                Select Case type
                    Case ContentItem.Type.e_MCID, ContentItem.Type.e_MCR
                        Console.Write("MCID: " & cont.GetMCID())
                    Case ContentItem.Type.e_OBJR
                        Console.Write("OBJR ")
                        Dim ref_obj As Obj = cont.GetRefObj()

                        If ref_obj IsNot Nothing Then
                            Console.Write("- Referenced Object#: " & ref_obj.GetObjNum())
                        End If
                End Select
            Else
                ' Child structure element: recurse at the already-incremented depth.
                ProcessStructElement(element.GetAsStructElem(i), indent)
            End If
        Next
    End Sub

    ' Sample 2 helper: walks every element the reader yields and, for paths, text runs
    ' and form XObjects, prints the element kind followed by details of its parent
    ' structure element (type, MCID, optional title, object number) when one exists.
    '
    ' reader - element reader the caller has already started with Begin(); this
    '          routine reads until Next() returns Nothing and does not call End().
    Sub ProcessElements(ByVal reader As ElementReader)
        Dim element As Element = reader.Next()
        While element IsNot Nothing  ' Read page contents
            Dim type As Element.Type = element.[GetType]()

            ' Fix: the third test used to repeat e_path, which left the e_form case
            ' below unreachable and form XObjects unreported.
            If type = Element.Type.e_path OrElse type = Element.Type.e_text OrElse type = Element.Type.e_form Then

                Select Case type
                    Case Element.Type.e_path
                        Console.WriteLine()
                        Console.Write("PATH: ")
                    Case Element.Type.e_text
                        Console.WriteLine()
                        ' WriteLine (not Write): the structure info for text goes on the next line.
                        Console.WriteLine("TEXT: " & element.GetTextString())
                    Case Element.Type.e_form
                        Console.WriteLine()
                        Console.Write("FORM XObject: ")
                End Select

                Dim struct_parent As SElement = element.GetParentStructElement()

                If struct_parent.IsValid() Then
                    Console.Write(" Type: " & struct_parent.[GetType]() & ", MCID: " & String.Format("{0}", element.GetStructMCID()))

                    If struct_parent.HasTitle() Then
                        Console.Write(". Title: " & struct_parent.GetTitle())
                    End If

                    Console.Write(", Obj#: " & struct_parent.GetSDFObj().GetObjNum())
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ' Sample 3 helper: collects the text of one page, grouped by marked-content id.
    '
    ' reader        - element reader the caller has already started with Begin().
    ' mcid_page_map - filled in place: key is the MCID (Integer), value is the
    '                 concatenation of every text run read with that MCID, in read order.
    Sub ProcessElements2(ByVal reader As ElementReader, ByVal mcid_page_map As Hashtable)
        Dim current As Element = reader.Next()

        Do Until current Is Nothing
            Dim id As Integer = current.GetStructMCID()

            ' Only text elements with a non-negative MCID contribute to the map.
            If id >= 0 AndAlso current.[GetType]() = Element.Type.e_text Then
                Dim chunk As String = current.GetTextString()

                If Not mcid_page_map.ContainsKey(id) Then
                    mcid_page_map.Add(id, chunk)
                Else
                    ' Same MCID seen before on this page: append to the stored text.
                    mcid_page_map(id) = CStr(mcid_page_map(id)) & chunk
                End If
            End If

            current = reader.Next()
        Loop
    End Sub

    ' Sample 3 helper: prints a structure element as an XML-like tag, recursing into
    ' child elements and writing the collected page text for MCID content items.
    '
    ' element      - structure element to print; an invalid element is skipped.
    ' mcid_doc_map - page number -> Hashtable (MCID -> text), as built by the caller
    '                with ProcessElements2.
    ' indent       - nesting depth of this element's opening and closing tags.
    Sub ProcessStructElement2(ByVal element As SElement, ByVal mcid_doc_map As Hashtable, ByVal indent As Integer)
        If Not element.IsValid() Then Return

        ' Opening tag, with an optional title attribute.
        PrintIndent(indent)
        Console.Write("<" & element.[GetType]())
        If element.HasTitle() Then
            Console.Write(" title=""" & element.GetTitle() & """")
        End If
        Console.Write(">")

        Dim kidCount As Integer = element.GetNumKids()

        For k As Integer = 0 To kidCount - 1
            ' Child structure elements are rendered one level deeper.
            If Not element.IsContentItem(k) Then
                ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1)
                Continue For
            End If

            ' Only plain MCID content items carry text that was collected per page.
            Dim item As ContentItem = element.GetAsContentItem(k)
            If item.[GetType]() <> ContentItem.Type.e_MCID Then Continue For

            Dim pageNumber As Integer = item.GetPage().GetIndex()
            If Not mcid_doc_map.ContainsKey(pageNumber) Then Continue For

            Dim pageTexts As Hashtable = CType(mcid_doc_map(pageNumber), Hashtable)
            Dim id As Integer = item.GetMCID()

            If pageTexts.ContainsKey(id) Then
                Console.Write(pageTexts(id))
            End If
        Next

        ' Closing tag on its own line at the same depth as the opening tag.
        PrintIndent(indent)
        Console.Write("</" & element.[GetType]() & ">")
    End Sub


    ' Entry point. Opens TestFiles/tagged.pdf and runs three samples against it:
    '   1 - traverse the logical structure tree and print it,
    '   2 - walk page content and print each element's parent structure element,
    '   3 - print the structure tree "XML style" together with the page text.
    ' The document is then saved to TestFiles/Output/LogicalStructure.pdf.
    ' A PDFNetException is reported on the console; other exceptions propagate.
    '
    ' args - command-line arguments (not used).
    Sub Main(ByVal args As String())
        PDFNet.Initialize()
        Dim input_path As String = "../../../../TestFiles/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        Try

            Using doc As PDFDoc = New PDFDoc(input_path & "tagged.pdf")
                doc.InitSecurityHandler()

                ' Switches for turning individual samples off while experimenting.
                Dim example1 As Boolean = True
                Dim example2 As Boolean = True
                Dim example3 As Boolean = True

                If example1 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 1 - Traverse logical structure tree...")
                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then
                        Console.WriteLine("Document has a StructTree root.")

                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement(tree.GetKid(i), 0)
                        Next
                    Else
                        Console.WriteLine("This document does not contain any logical structure.")
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 1.")
                End If

                If example2 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 2 - Get parent logical structure elements from")
                    Console.WriteLine("layout elements.")

                    ' Using releases the reader even if processing a page throws.
                    Using reader As ElementReader = New ElementReader()
                        Dim itr As PageIterator = doc.GetPageIterator()

                        While itr.HasNext()
                            reader.Begin(itr.Current())
                            ProcessElements(reader)
                            reader.[End]()
                            itr.[Next]()
                        End While
                    End Using

                    Console.WriteLine()
                    Console.WriteLine("Done 2.")
                End If

                If example3 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")

                    ' page number -> (MCID -> text) for every page in the document.
                    Dim mcid_doc_map As Hashtable = New Hashtable()

                    ' Using releases the reader even if processing a page throws.
                    Using reader As ElementReader = New ElementReader()
                        Dim itr As PageIterator = doc.GetPageIterator()

                        While itr.HasNext()
                            Dim pg As Page = itr.Current()
                            reader.Begin(pg)
                            Dim page_mcid_map As Hashtable = New Hashtable()
                            mcid_doc_map.Add(pg.GetIndex(), page_mcid_map)
                            ProcessElements2(reader, page_mcid_map)
                            reader.[End]()
                            itr.[Next]()
                        End While
                    End Using

                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then

                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0)
                        Next
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 3.")
                End If

                doc.Save(output_path & "LogicalStructure.pdf", 0)
            End Using

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub

End Module
close

Free Trial

Get unlimited trial usage of PDFTron SDK to bring accurate, reliable, and fast document processing capabilities to any application or workflow.

Select a platform to get started with your free trial.

Unlimited usage. No email address required.