//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2012 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

// NOTE(review): the header names were missing from both #import directives.
// These are the headers the PDFNet Objective-C samples conventionally use --
// confirm against the SDK layout before shipping.
#import <OBJC/PDFNetOBJC.h>
#import <Foundation/Foundation.h>

// This sample illustrates the basic text extraction capabilities of PDFNet.
// The file is built with manual reference counting (NSAutoreleasePool,
// -autorelease), so no ARC-only constructs are used.

// Document processed by every example in this sample.
static NSString * const kInputPath = @"../../TestFiles/newsletter.pdf";

// A utility method used to dump all text content in the console window.
// Walks every element the reader yields, logging text blocks, text runs
// (with their bounding box) and new-line markers, and recursing into form
// XObjects so that nested content is dumped as well.
void DumpAllText(ElementReader *reader)
{
    Element *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_text_begin:
                NSLog(@"--> Text Block Begin");
                break;

            case e_text_end:
                NSLog(@"--> Text Block End");
                break;

            case e_text:
            {
                PDFRect *bbox = [element GetBBox];
                NSLog(@"--> BBox: %f, %f, %f, %f",
                      [bbox GetX1], [bbox GetY1], [bbox GetX2], [bbox GetY2]);
                NSLog(@"%@", [element GetTextString]);
                break;
            }

            case e_text_new_line:
                NSLog(@"--> New Line");
                break;

            case e_form: // Process form XObjects
                [reader FormBegin];
                DumpAllText(reader);
                [reader End];
                break;

            default:
                // Non-text elements are not interesting for this dump.
                break;
        }
    }
}

// Appends to `out` the text of every text element whose bounding box
// intersects `pos`, one element per line, recursing into form XObjects.
// The accumulator is mutable so that text gathered in nested calls is
// visible to the caller.
static void CollectTextInRect(ElementReader *reader, PDFRect *pos, NSMutableString *out)
{
    Element *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_text:
            {
                PDFRect *bbox = [element GetBBox];
                if ([bbox IntersectRect: bbox rect2: pos])
                {
                    NSString *text = [element GetTextString];
                    // -appendString: raises on nil, so skip elements without text.
                    if (text != nil)
                    {
                        [out appendString: text];
                        [out appendString: @"\n"];
                    }
                }
                break;
            }

            case e_form: // Process form XObjects
                [reader FormBegin];
                CollectTextInRect(reader, pos, out);
                [reader End];
                break;

            default:
                break;
        }
    }
}

// A helper method for ReadTextFromRect, kept with its historical signature.
// `srch_str` is received by value and NSString is immutable, so nothing
// gathered through this entry point can reach the caller; it only consumes
// the reader's elements. Code that needs the text should go through
// ReadTextFromRect instead.
void RectTextSearch(ElementReader *reader, PDFRect *pos, NSString *srch_str)
{
    (void)srch_str; // cannot be updated in place; see the comment above
    CollectTextInRect(reader, pos, [NSMutableString string]);
}

// A utility method used to extract all text content from
// a given selection rectangle. The rectangle coordinates are
// expressed in PDF user/page coordinate system.
// Returns an autoreleased string holding one line per intersecting text
// element; the string is empty when nothing intersects the rectangle.
NSString* ReadTextFromRect(Page *page, PDFRect *pos, ElementReader *reader)
{
    NSMutableString *found = [NSMutableString string];

    [reader Begin: page];
    CollectTextInRect(reader, pos, found);
    [reader End];

    return found;
}

// Logs the style attributes (font name, font size, serif flag and the three
// colour components) of `s` in the form of an XML `style` attribute.
void PrintStyle(TextExtractorStyle *s)
{
    NSMutableArray *rgb = [s GetColor];
    // The flag printed is IsSerif, so it is labelled "serif".
    NSLog(@" style=\"font-family:%@; font-size:%f; serif: %d; color: #%@, %@, %@\"",
          [s GetFontName], [s GetFontSize], (int)[s IsSerif],
          [rgb objectAtIndex: 0], [rgb objectAtIndex: 1], [rgb objectAtIndex: 2]);
}

// Example 3 helper: logs every word on the page, one word per log entry.
static void PrintWordList(TextExtractor *txt)
{
    TextExtractorLine *line;
    Word *word;
    for (line = [txt GetFirstLine]; [line IsValid]; line = [line GetNextLine])
    {
        for (word = [line GetFirstWord]; [word IsValid]; word = [word GetNextWord])
        {
            NSLog(@"%@", [word GetString]);
        }
    }
    NSLog(@"-----------------------------------------------------------");
}

// Example 4 helper: logs the page as an XML-like structure of
// Flow > Para > Line > Word elements, with the bounding box of each line and
// word and the style of each line.
static void PrintStructuredText(TextExtractor *txt)
{
    int cur_flow_id = -1, cur_para_id = -1;
    TextExtractorLine *line;
    Word *word;

    // For each line on the page...
    for (line = [txt GetFirstLine]; [line IsValid]; line = [line GetNextLine])
    {
        if ([line GetNumWords] == 0)
        {
            continue;
        }

        // A new flow closes the paragraph and flow that were open, if any.
        int flow_id = [line GetFlowID];
        if (cur_flow_id != flow_id)
        {
            if (cur_flow_id != -1)
            {
                if (cur_para_id != -1)
                {
                    cur_para_id = -1;
                    NSLog(@"</Para>");
                }
                NSLog(@"</Flow>\n");
            }
            cur_flow_id = flow_id;
            NSLog(@"<Flow id=\"%d\">\n", cur_flow_id);
        }

        // Likewise, a new paragraph closes the previous one.
        int para_id = [line GetParagraphID];
        if (cur_para_id != para_id)
        {
            if (cur_para_id != -1)
            {
                NSLog(@"</Para>\n");
            }
            cur_para_id = para_id;
            NSLog(@"<Para id=\"%d\">\n", cur_para_id);
        }

        PDFRect *line_box = [line GetBBox];
        NSLog(@"<Line box=\"%.2f, %.2f, %.2f, %.2f\"",
              [line_box GetX1], [line_box GetY1], [line_box GetX2], [line_box GetY2]);
        PrintStyle([line GetStyle]);
        NSLog(@">\n");

        // For each word in the line...
        for (word = [line GetFirstWord]; [word IsValid]; word = [word GetNextWord])
        {
            // Output the bounding box for the word, followed by its text.
            PDFRect *word_box = [word GetBBox];
            NSString *uni_str = [word GetString];
            NSLog(@"<Word box=\"%.2f, %.2f, %.2f, %.2f\">%@",
                  [word_box GetX1], [word_box GetY1], [word_box GetX2], [word_box GetY2],
                  uni_str);
            NSLog(@"</Word>\n");
        }
        NSLog(@"</Line>\n");
    }

    // Close whatever is still open after the last line.
    if (cur_flow_id != -1)
    {
        if (cur_para_id != -1)
        {
            cur_para_id = -1;
            NSLog(@"</Para>\n");
        }
        NSLog(@"</Flow>\n");
    }
}

// Example 5 helper: low-level extraction through ElementReader. Dumps all
// text of every page, then extracts the text found inside three fixed
// rectangles on the first page. Exceptions propagate to the caller.
static void RunLowLevelExtraction(void)
{
    PDFDoc *doc = [[[PDFDoc alloc] initWithFilepath: kInputPath] autorelease];
    [doc InitSecurityHandler];

    // Example 1. Extract all text content from the document
    ElementReader *reader = [[[ElementReader alloc] init] autorelease];

    // Read every page
    PageIterator *itr;
    for (itr = [doc GetPageIterator: 1]; [itr HasNext]; [itr Next])
    {
        [reader Begin: [itr Current]];
        DumpAllText(reader);
        [reader End];
    }

    // Example 2. Extract text content based on the
    // selection rectangle.
    NSLog(@"\n----------------------------------------------------");
    NSLog(@"\nExtract text based on the selection rectangle.");
    NSLog(@"\n----------------------------------------------------\n");

    Page *first_page = [[doc GetPageIterator: 1] Current];

    PDFRect *rect1 = [[[PDFRect alloc] initWithX1: 27 y1: 392 x2: 563 y2: 534] autorelease];
    NSLog(@"\nField 1: %@", ReadTextFromRect(first_page, rect1, reader));

    PDFRect *rect2 = [[[PDFRect alloc] initWithX1: 28 y1: 551 x2: 106 y2: 623] autorelease];
    NSLog(@"\nField 2: %@", ReadTextFromRect(first_page, rect2, reader));

    PDFRect *rect3 = [[[PDFRect alloc] initWithX1: 208 y1: 550 x2: 387 y2: 621] autorelease];
    NSLog(@"\nField 3: %@", ReadTextFromRect(first_page, rect3, reader));

    // ...
    NSLog(@"Done.");
}

// Entry point. Runs the high-level TextExtractor examples selected by the
// flags below and, optionally, the low-level ElementReader example.
// Returns 0 on success and 1 when the page is missing or an exception was
// raised by any example.
int main(int argc, char *argv[])
{
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
    int ret = 0;

    [PDFNet Initialize: 0];

    BOOL example1_basic     = YES;
    BOOL example2_xml       = YES;
    BOOL example3_wordlist  = YES;
    BOOL example4_advanced  = YES;
    BOOL example5_low_level = NO;

    // Sample code showing how to use high-level text extraction APIs.
    @try
    {
        PDFDoc *doc = [[[PDFDoc alloc] initWithFilepath: kInputPath] autorelease];
        [doc InitSecurityHandler];

        Page *page = [doc GetPage: 1];
        if (!page)
        {
            NSLog(@"Page not found.");
            // Drain the pool before leaving so the early exit does not leak it.
            [pool release];
            return 1;
        }

        TextExtractor *txt = [[[TextExtractor alloc] init] autorelease];
        [txt Begin: page clip_ptr: 0 flags: 0]; // Read the page.
        // Other options you may want to consider...
        // txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
        // txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);

        // Example 1. Get all text on the page in a single string.
        // Words will be separated with space or new line characters.
        if (example1_basic)
        {
            // Get the word count.
            NSLog(@"Word Count: %d", (int)[txt GetWordCount]);

            NSString *text = [txt GetAsText: YES];
            NSLog(@"\n\n- GetAsText --------------------------\n%@", text);
            NSLog(@"-----------------------------------------------------------");
        }

        // Example 2. Get XML logical structure for the page.
        if (example2_xml)
        {
            NSString *text = [txt GetAsXML: e_words_as_elements | e_output_bbox | e_output_style_info];
            NSLog(@"\n\n- GetAsXML --------------------------\n %@", text);
            NSLog(@"-----------------------------------------------------------");
        }

        // Example 3. Extract words one by one.
        if (example3_wordlist)
        {
            PrintWordList(txt);
        }

        // Example 4. A more advanced text extraction example.
        // The output is XML structure containing paragraphs, lines, words,
        // as well as style and positioning information.
        if (example4_advanced)
        {
            PrintStructuredText(txt);
        }
    }
    @catch (NSException *e)
    {
        NSLog(@"%@", [e reason]);
        ret = 1;
    }

    if (example5_low_level)
    {
        @try
        {
            RunLowLevelExtraction();
        }
        @catch (NSException *e)
        {
            NSLog(@"%@", [e reason]);
            ret = 1;
        }
    }

    [pool release];
    return ret;
}