Some test text!

Search
Hamburger Icon

PDF logical structure reader in Java

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
VB
C# (Xamarin)

Sample Java code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump that information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our Java PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

import java.util.Map;
import java.util.TreeMap;

import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.struct.*;
import com.pdftron.pdf.*;
import com.pdftron.sdf.*;

//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps 
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information 
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s which are leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such 
// as text and images.
//---------------------------------------------------------------------------------------
public class LogicalStructureTest {
    static void PrintIndent(int indent) {
        System.out.println();
        for (int i = 0; i < indent; ++i) System.out.print("  ");
    }

    // Used in code snippet 1.
    static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
        if (!element.isValid()) {
            return;
        }

        // Print out the type and title info, if any.
        PrintIndent(indent++);
        System.out.print("Type: " + element.getType());
        if (element.hasTitle()) {
            System.out.print(". Title: " + element.getTitle());
        }

        int num = element.getNumKids();
        for (int i = 0; i < num; ++i) {
            // Check is the kid is a leaf node (i.e. it is a ContentItem).
            if (element.isContentItem(i)) {
                ContentItem cont = element.getAsContentItem(i);
                int type = cont.getType();

                Page page = cont.getPage();

                PrintIndent(indent);
                System.out.print("Content Item. Part of page #" + page.getIndex());

                PrintIndent(indent);
                switch (type) {
                    case ContentItem.e_MCID:
                    case ContentItem.e_MCR:
                        System.out.print("MCID: " + cont.getMCID());
                        break;
                    case ContentItem.e_OBJR: {
                        System.out.print("OBJR ");
                        Obj ref_obj = cont.getRefObj();
                        if (ref_obj != null)
                            System.out.print("- Referenced Object#: " + ref_obj.getObjNum());
                    }
                    break;
                    default:
                        break;
                }
            } else {  // the kid is another StructElement node.
                ProcessStructElement(element.getAsStructElem(i), indent);
            }
        }
    }

    // Used in code snippet 2.
    static void ProcessElements(ElementReader reader) throws PDFNetException {
        Element element;
        while ((element = reader.next()) != null)    // Read page contents
        {
            // In this sample we process only paths & text, but the code can be
            // extended to handle any element type.
            int type = element.getType();
            if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
                switch (type) {
                    case Element.e_path:                // Process path ...
                        System.out.print("\nPATH: ");
                        break;
                    case Element.e_text:                // Process text ...
                        System.out.print("\nTEXT: " + element.getTextString() + "\n");
                        break;
                    case Element.e_form:                // Process form XObjects
                        System.out.print("\nFORM XObject: ");
                        //reader.FormBegin();
                        //ProcessElements(reader);
                        //reader.End();
                        break;
                }

                // Check if the element is associated with any structural element.
                // Content items are leaf nodes of the structure tree.
                SElement struct_parent = element.getParentStructElement();
                if (struct_parent.isValid()) {
                    // Print out the parent structural element's type, title, and object number.
                    System.out.print(" Type: " + struct_parent.getType()
                            + ", MCID: " + element.getStructMCID());
                    if (struct_parent.hasTitle()) {
                        System.out.print(". Title: " + struct_parent.getTitle());
                    }
                    System.out.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
                }
            }
        }
    }

    // Used in code snippet 3.
    //typedef map<int, string> MCIDPageMap;
    //typedef map<int, MCIDPageMap> MCIDDocMap;

    // Used in code snippet 3.
    static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
        Element element;
        while ((element = reader.next()) != null) // Read page contents
        {
            // In this sample we process only text, but the code can be extended
            // to handle paths, images, or any other Element type.
            int mcid = element.getStructMCID();
            Integer key_mcid = new Integer(mcid);
            if (mcid >= 0 && element.getType() == Element.e_text) {
                String val = element.getTextString();
                if (mcid_page_map.containsKey(key_mcid))
                    mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
                else mcid_page_map.put(key_mcid, val);
            }
        }
    }

    // Used in code snippet 3.
    static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
        if (!element.isValid()) {
            return;
        }

        // Print out the type and title info, if any.
        PrintIndent(indent);
        System.out.print("<" + element.getType());
        if (element.hasTitle()) {
            System.out.print(" title=\"" + element.getTitle() + "\"");
        }
        System.out.print(">");

        int num = element.getNumKids();
        for (int i = 0; i < num; ++i) {
            if (element.isContentItem(i)) {
                ContentItem cont = element.getAsContentItem(i);
                if (cont.getType() == ContentItem.e_MCID) {
                    int page_num = cont.getPage().getIndex();
                    Integer page_num_key = new Integer(page_num);
                    if (mcid_doc_map.containsKey(page_num_key)) {
                        Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
                        Integer mcid_key = new Integer(cont.getMCID());
                        if (mcid_page_map.containsKey(mcid_key)) {
                            System.out.print(mcid_page_map.get(mcid_key));
                        }
                    }
                }
            } else {  // the kid is another StructElement node.
                ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
            }
        }

        PrintIndent(indent);
        System.out.print("</" + element.getType() + ">");
    }


    /**
     * @param args
     */
    public static void main(String[] args) {
        PDFNet.initialize(PDFTronLicense.Key());

        // Relative path to the folder containing test files.
        String input_path = "../../TestFiles/";
        String output_path = "../../TestFiles/Output/";

        try (PDFDoc doc = new PDFDoc((input_path + "tagged.pdf")))    // Extract logical structure from a PDF document
        {
            doc.initSecurityHandler();

            System.out.println("____________________________________________________________");
            System.out.println("Sample 1 - Traverse logical structure tree...");
            {
                STree tree = doc.getStructTree();
                if (tree.isValid()) {
                    System.out.println("Document has a StructTree root.");

                    for (int i = 0; i < tree.getNumKids(); ++i) {
                        // Recursively get structure  info for all all child elements.
                        ProcessStructElement(tree.getKid(i), 0);
                    }
                } else {
                    System.out.println("This document does not contain any logical structure.");
                }
            }
            System.out.println("\nDone 1.");

            System.out.println("____________________________________________________________");
            System.out.println("Sample 2 - Get parent logical structure elements from");
            System.out.println("layout elements.");
            {
                ElementReader reader = new ElementReader();
                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
                    reader.begin(itr.next());
                    ProcessElements(reader);
                    reader.end();
                }
            }
            System.out.println("\nDone 2.");

            System.out.println("____________________________________________________________");
            System.out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
            {
                //A map which maps page numbers(as Integers)
                //to page Maps(which map from struct mcid(as Integers) to
                //text Strings)
                Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
                ElementReader reader = new ElementReader();
                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
                    Page current = itr.next();
                    reader.begin(current);
                    Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
                    mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
                    ProcessElements2(reader, page_mcid_map);
                    reader.end();
                }

                STree tree = doc.getStructTree();
                if (tree.isValid()) {
                    for (int i = 0; i < tree.getNumKids(); ++i) {
                        ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
                    }
                }
            }
            System.out.println("\nDone 3.");
            doc.save((output_path + "LogicalStructure.pdf"), SDFDoc.SaveMode.LINEARIZED, null);
        } catch (Exception e) {
            e.printStackTrace();
        }

        PDFNet.terminate();
    }

}