// iOS samples
//
// TextExtractTest - Swift
//
// This sample illustrates the basic text extraction capabilities of PDFNet.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2017 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

import PDFNet
import Foundation

// This sample illustrates the basic text extraction capabilities of PDFNet.

/// Dumps all text content reachable from `reader` to the console.
///
/// Walks the elements returned by `reader.next()` and prints a marker for each
/// text-block begin/end and new-line element, plus the bounding box and string
/// of every text object. Form XObjects are entered with `formBegin()` and
/// processed recursively, then closed with `end()`.
///
/// - Parameter reader: An element reader on which `begin(...)` has already been
///   called by the caller; this function never calls `begin` itself.
func DumpAllText(_ reader: PTElementReader) {
    while let element = reader.next() {
        switch element.getType() {
        case e_pttext_begin:
            print("--> Text Block Begin")
        case e_pttext_end:
            print("--> Text Block End")
        case e_pttext_obj:
            let bbox: PTPDFRect = element.getBBox()
            print("--> BBox: \(bbox.getX1()), \(bbox.getY1()), \(bbox.getX2()), \(bbox.getY2())")
            // A text object without a string prints an empty line instead of
            // crashing on a force-unwrap (same fallback RectTextSearch uses).
            print(element.getTextString() ?? "")
        case e_pttext_new_line:
            print("--> New Line")
        case e_ptform:
            // Process form XObjects: descend into the form, dump its content,
            // then return to the enclosing content stream.
            reader.formBegin()
            DumpAllText(reader)
            reader.end()
        default:
            break
        }
    }
}

/// Recursive worker for `ReadTextFromRect`.
///
/// Consumes elements from `reader` until it is exhausted. The string of every
/// text object whose bounding box intersects `pos` is appended to `srch_str`,
/// followed by a newline. Form XObjects are entered and searched recursively.
///
/// - Parameters:
///   - reader: Element reader positioned by the caller.
///   - pos: Selection rectangle to test each text object's bounding box against.
///   - srch_str: Accumulator that receives the matching text.
func RectTextSearch(reader: PTElementReader, pos: PTPDFRect, srch_str: inout String) {
    while let current = reader.next() {
        let kind = current.getType()

        if kind == e_pttext_obj {
            let bounds: PTPDFRect = current.getBBox()
            // Text objects outside the selection rectangle contribute nothing.
            guard bounds.intersect(bounds, rect2: pos) else {
                continue
            }
            srch_str += (current.getTextString() ?? "")
            srch_str += "\n"    // add a new line?
        } else if kind == e_ptform {
            // Process form XObjects
            reader.formBegin()
            RectTextSearch(reader: reader, pos: pos, srch_str: &srch_str)
            reader.end()
        }
        // Every other element type (including text new-line markers) is ignored.
    }
}

/// Extracts all text content that falls inside a selection rectangle on a page.
///
/// Begins reading `page` with `reader`, delegates the element walk to
/// `RectTextSearch`, and ends the read before returning.
///
/// - Parameters:
///   - page: The page to read.
///   - pos: Selection rectangle, expressed in the PDF user/page coordinate system.
///   - reader: Element reader used to process the page.
/// - Returns: The text of every intersecting text object, each followed by a newline.
func ReadTextFromRect(page: PTPage, pos: PTPDFRect, reader: PTElementReader) -> String {
    var collected = String()

    reader.begin(page)
    // Close the page read once the search has finished.
    defer { reader.end() }

    RectTextSearch(reader: reader, pos: pos, srch_str: &collected)
    return collected
}

/// Prints a text style as a ` style="..."` attribute fragment.
///
/// The fragment contains the font name, the font size, the value of
/// `isSerif()`, and the first three components of the style's color array.
///
/// - Parameter s: The style to print.
func PrintStyle(_ s: PTTextExtractorStyle) {
    let rgb: NSMutableArray = s.getColor()
    // A style without a font name prints an empty family instead of crashing
    // on a force-unwrap.
    let fontName = s.getFontName() ?? ""
    // NOTE(review): the "sans-serif:" label is printed next to the value of
    // isSerif(); the label looks inverted, but the emitted text is kept as-is —
    // confirm the intended output before changing it.
    print(" style=\"font-family:\(fontName); font-size:\(s.getFontSize()); sans-serif: \(s.isSerif()); color: #\(rgb[0]), \(rgb[1]), \(rgb[2])\"")
}

/// Runs the text-extraction sample against the bundled "newsletter.pdf".
///
/// Opens the document, reads page 1 with a `PTTextExtractor`, and prints the
/// output of each enabled example to the console:
///   1. the page text as a single string,
///   2. the page as XML,
///   3. every word, one per line,
///   4. a Flow / Para / Line / Word dump with bounding boxes and style info.
/// The low-level `PTElementReader` example (5) is disabled and its code is
/// commented out.
///
/// - Returns: `0` on success; `1` if page 1 cannot be obtained or an error is
///   caught from `PTPDFNet.catchException`.
func runTextExtractTest() -> Int {
    return autoreleasepool {
        var ret = 0

        // Toggles for the individual examples below.
        let example1_basic = true
        let example2_xml = true
        let example3_wordlist = true
        let example4_advanced = true
        let example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        do {
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
                doc.initSecurityHandler()

                guard let page: PTPage = doc.getPage(1) else {
                    print("Page not found.")
                    ret = 1
                    return
                }

                let txt: PTTextExtractor = PTTextExtractor()
                txt.begin(page, clip_ptr: nil, flags: 0)    // Read the page.
                // Other options you may want to consider...
                // txt.begin(page, nil, e_ptno_dup_remove);
                // txt.begin(page, nil, e_ptremove_hidden_text);

                // Example 1. Get all text on the page in a single string.
                // Words will be separated with space or new line characters.
                if example1_basic {
                    // Get the word count.
                    print("Word Count: \(txt.getWordCount())")

                    let text: String = txt.getAsText(true)
                    print("\n\n- GetAsText --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 2. Get XML logical structure for the page.
                if example2_xml {
                    let text: String = txt.getAsXML(e_ptwords_as_elements.rawValue | e_ptoutput_bbox.rawValue | e_ptoutput_style_info.rawValue)
                    print("\n\n- GetAsXML  --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 3. Extract words one by one.
                if example3_wordlist {
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // A word without a string prints an empty line
                            // rather than crashing on a force-unwrap.
                            print(word.getString() ?? "")
                            word = word.getNext()
                        }
                        line = line.getNext()
                    }
                    print("-----------------------------------------------------------")
                }

                // Example 4. A more advanced text extraction example.
                // The output is XML structure containing paragraphs, lines, words,
                // as well as style and positioning information.
                if example4_advanced {
                    // -1 means "no <Flow>/<Para> element is currently open".
                    var cur_flow_id = -1
                    var cur_para_id = -1

                    // For each line on the page...
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        // Skip empty lines. The iterator must be advanced before
                        // `continue`; otherwise the same line is tested forever.
                        if line.getNumWords() == 0 {
                            line = line.getNext()
                            continue
                        }

                        // Entering a new flow closes any open paragraph and flow first.
                        let flow_id = Int(line.getFlowID())
                        if cur_flow_id != flow_id {
                            if cur_flow_id != -1 {
                                if cur_para_id != -1 {
                                    cur_para_id = -1
                                    print("</Para>")
                                }
                                print("</Flow>\n")
                            }
                            cur_flow_id = flow_id
                            print("<Flow id=\"\(cur_flow_id)\">\n")
                        }

                        // Entering a new paragraph closes the previous one.
                        let para_id = Int(line.getParagraphID())
                        if cur_para_id != para_id {
                            if cur_para_id != -1 {
                                print("</Para>\n")
                            }
                            cur_para_id = para_id
                            print("<Para id=\"\(cur_para_id)\">\n")
                        }

                        let b: PTPDFRect = line.getBBox()
                        let line_style: PTTextExtractorStyle = line.getStyle()
                        print("<Line box=\"\(b.getX1()), \(b.getY1()), \(b.getX2()), \(b.getY2())\"")
                        PrintStyle(line_style)
                        print(">\n")

                        // For each word in the line...
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // Skip zero-length words before emitting anything, so no
                            // unterminated <Word ...> tag is left behind, and advance
                            // the iterator so the loop cannot spin on the same word.
                            if word.getStringLen() == 0 {
                                word = word.getNext()
                                continue
                            }

                            // Output the bounding box for the word.
                            let q: PTPDFRect = word.getBBox()
                            print("<Word box=\"\(q.getX1()), \(q.getY1()), \(q.getX2()), \(q.getY2())\"")

                            // If the word style is different from the parent style, output the new style.
                            // NOTE(review): `!=` relies on the style type's equality
                            // implementation; confirm it compares style values rather
                            // than object identity.
                            let s: PTTextExtractorStyle = word.getStyle()
                            if s != line_style {
                                PrintStyle(s)
                            }

                            let uni_str: String = word.getString() ?? ""
                            print(">\(uni_str)")
                            print("</Word>\n")
                            word = word.getNext()
                        }
                        print("</Line>\n")
                        line = line.getNext()
                    }

                    // Close whatever is still open after the last line.
                    if cur_flow_id != -1 {
                        if cur_para_id != -1 {
                            print("</Para>\n")
                        }
                        print("</Flow>\n")
                    }
                }
            }
        } catch let e as NSError {
            print("\(e)")
            ret = 1
        }

        // Example 5 (low-level ElementReader API) is disabled; its code is kept
        // commented out below for reference.
        if example5_low_level {
//            do {
//                try PTPDFNet.catchException {
//                    let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
//                    doc.initSecurityHandler()
//
//                    // Example 1. Extract all text content from the document
//
//                    let reader: PTElementReader = PTElementReader()
//                    //  Read every page
//                    let itr: PTPageIterator = doc.getPageIterator(1)
//                    while itr.hasNext() {
//                        reader.begin(itr.current())
//                        DumpAllText(reader)
//                        reader.end()
//                        itr.next()
//                    }
//
//                    // Example 2. Extract text content based on the
//                    // selection rectangle.
//                    print("\n----------------------------------------------------")
//                    print("\nExtract text based on the selection rectangle.")
//                    print("\n----------------------------------------------------\n")
//
//                    let first_page: PTPage = doc.getPageIterator(1).current()
//                    let rect1: PTPDFRect = PTPDFRect(x1: 27, y1: 392, x2: 563, y2: 534)
//                    var s1: String = ReadTextFromRect(page: first_page, pos: rect1, reader: reader)
//                    print("\nField 1: \(s1)")
//
//                    let rect2: PTPDFRect = PTPDFRect(x1: 28, y1: 551, x2: 106, y2: 623)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect2, reader: reader)
//                    print("\nField 2: \(s1)")
//
//                    let rect3: PTPDFRect = PTPDFRect(x1: 208, y1: 550, x2: 387, y2: 621)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect3, reader: reader)
//                    print("\nField 3: \(s1)")
//
//                    // ...
//                    print("Done.")
//                }
//            } catch let e as NSError {
//                print("\(e)")
//                ret = 1
//            }
        }

        return ret
    }
}