// Overview (added in review)
// --------------------------
// Command-line sample that walks the content of every page of a PDF with an
// ElementReader and logs what it finds via NSLog. Memory is managed manually
// (NSAutoreleasePool, -autorelease, [pool release]).
//
// NOTE(review): this copy of the file appears to be damaged and should be restored
// from version control before it is built or edited further:
//   * Line breaks have been collapsed, so each `//` comment now swallows all code
//     that follows it on the same physical line.
//   * Both `#import` directives have lost their targets.
//   * In ProcessPath, the text jumps from the operator loop header
//     (`for (; opr_index`) straight to `dashes;` inside the graphics-state switch.
//     It looks like everything between a `<` and a later `>` was stripped.
//     The missing span is NOT recoverable from this file; do not guess at it.
//
// Functions visible in the surviving text:
//
//   ProcessPath(reader, path)
//     Logs "This is a clipping path" when [path IsClippingPath] is true, fetches the
//     points (NSMutableArray) and operators (NSData) from [path GetPathData], and starts
//     building a string beginning with " Path Data Points := \"". The operator loop and
//     most of the graphics-state handling are missing (see note above). In the surviving
//     e_fill_color case, when the fill colour space type is e_pattern and the fill
//     pattern type is not e_shading, the pattern content is processed through
//     [reader PatternBegin: YES reset_ctm_tfm: NO] / ProcessElements / [reader End].
//     Finally it calls [reader ClearChangeList] and logs the accumulated string.
//
//   ProcessText(page_reader)
//     Logs "Begin Text Block:" and then pulls elements with [page_reader Next] until it
//     returns NULL. On e_text_end it logs "End Text Block." and returns. On e_text_obj it
//     reads the fill/stroke colour spaces and colours from the element's GState, converts
//     the fill colour with Convert2RGB, and logs the font name. For an e_Type3 font each
//     character from GetCharIterator is processed through
//     Type3FontBegin:resource_dict: / ProcessElements / End; otherwise the character
//     codes are appended to a string that is logged, and per-character x/y and the CTM
//     are read.
//     NOTE(review): `char_code>=32 || char_code<=255` is true for every unsigned value,
//       so nothing is ever filtered out; `&&` looks intended given the
//       "Print if in ASCII range" comment. Confirm before changing.
//     NOTE(review): `Matrix2D *mtx = text_mtx;` copies the pointer, not the matrix, so
//       Concat is sent to the object returned by GetTextMatrix once per character.
//       Whether that accumulates depends on Matrix2D semantics; verify against PDFNet.
//       The result of Mult is discarded.
//     NOTE(review): outColor, cs_stroke, stroke, x and y are assigned but not otherwise used.
//
//   ProcessImage(image)
//     Reads the mask/interpolate flags and the width and height, logs the dimensions,
//     wraps the element in an Image2RGB filter and reads width * height * 3 bytes from a
//     FilterReader built on it.
//     NOTE(review): the log format `height=\"%d` opens an escaped quote that is never
//       closed. image_mask, interpolate and image_data_out are not used afterwards.
//       The byte count is computed in `int`.
//
//   ProcessElements(reader)
//     Dispatch loop over [reader Next] until NULL: e_path -> ProcessPath,
//     e_text_begin -> ProcessText, e_form -> FormBegin / recursive ProcessElements / End,
//     e_image -> ProcessImage. Other element types fall through the switch untouched.
//
//   main
//     Creates the autorelease pool, calls [PDFNet Initialize: 0], opens
//     "../../TestFiles/newsletter.pdf", calls InitSecurityHandler, then for each page
//     from [doc GetPageIterator: 1] logs the page index and runs
//     Begin / ProcessElements / End on a single ElementReader. An NSException is caught,
//     its reason is logged and the return code becomes 1. The pool is released before
//     returning.
//     NOTE(review): pgnum is assigned but not otherwise used.
//
//   m_buf (file-scope char[4000]) has no use in the surviving text; it may have been
//   referenced in the lost part of ProcessPath. TODO confirm.
//
//--------------------------------------------------------------------------------------- // Copyright (c) 2001-2012 by PDFTron Systems Inc. All Rights Reserved. // Consult legal.txt regarding legal and license information. //--------------------------------------------------------------------------------------- #import #import char m_buf[4000]; void ProcessElements(ElementReader *reader); void ProcessPath(ElementReader *reader, Element *path) { if ([path IsClippingPath]) { NSLog(@"This is a clipping path"); } PathData* pathData = [path GetPathData]; NSMutableArray* data = [pathData GetPoints]; NSData* opr = [pathData GetOperators]; int opr_index = 0; int opr_end = [opr length]; int data_index = 0; int data_end = [data count]; double x1, y1, x2, y2, x3, y3; NSString *str = @""; // Use path.GetCTM() if you are interested in CTM (current transformation matrix). unsigned char* opr_data = (unsigned char*)[opr bytes]; str = [str stringByAppendingFormat: @" Path Data Points := \""]; for (; opr_index dashes; // gs.GetDashes(dashes); // gs.GetPhase() } break; case e_fill_color: { if ( [[gs GetFillColorSpace] GetType] == e_pattern && [[gs GetFillPattern] GetType] != e_shading ) { //process the pattern data [reader PatternBegin: YES reset_ctm_tfm: NO]; ProcessElements(reader); [reader End]; } } break; } } [reader ClearChangeList]; NSLog(@"%@", str); } void ProcessText(ElementReader* page_reader) { // Begin text element NSLog(@"Begin Text Block:"); Element *element; while ((element = [page_reader Next]) != NULL) { switch ([element GetType]) { case e_text_end: // Finish the text block //str = [str stringByAppendingString: @"End Text Block.\n"]; NSLog(@"End Text Block."); return; case e_text_obj: { GState *gs = [element GetGState]; ColorSpace *cs_fill = [gs GetFillColorSpace]; ColorPt *fill = [gs GetFillColor]; ColorPt *outColor = [cs_fill Convert2RGB: fill]; ColorSpace *cs_stroke = [gs GetStrokeColorSpace]; ColorPt *stroke = [gs GetStrokeColor]; Font *font = [gs GetFont]; 
NSLog(@"Font Name: %@\n", [font GetName]); // font.IsFixedWidth(); // font.IsSerif(); // font.IsSymbolic(); // font.IsItalic(); // ... // double font_size = gs.GetFontSize(); // double word_spacing = gs.GetWordSpacing(); // double char_spacing = gs.GetCharSpacing(); // const UString* txt = element.GetTextString(); if ( [font GetType] == e_Type3 ) { //type 3 font, process its data CharIterator *itr; for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next]) { [page_reader Type3FontBegin: [itr Current] resource_dict: 0]; ProcessElements(page_reader); [page_reader End]; } } else { Matrix2D *text_mtx = [element GetTextMatrix]; double x, y; unsigned int char_code; CharIterator *itr; NSString* str = @""; for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next]) { char_code = [[itr Current] getChar_code]; if (char_code>=32 || char_code<=255) { // Print if in ASCII range... str = [str stringByAppendingFormat: @"%c", char_code]; } x = [[itr Current] getX]; // character positioning information y = [[itr Current] getY]; // Use element.GetCTM() if you are interested in the CTM // (current transformation matrix). Matrix2D *ctm = [element GetCTM]; // To get the exact character positioning information you need to // concatenate current text matrix with CTM and then multiply // relative positioning coordinates with the resulting matrix. Matrix2D *mtx = text_mtx; [mtx Concat: [ctm getM_a] b: [ctm getM_b] c: [ctm getM_c] d: [ctm getM_d] h: [ctm getM_h] v: [ctm getM_v]]; [mtx Mult: [[[PDFPoint alloc] initWithPx: x py: y] autorelease]]; // Get glyph path... 
//vector oprs; //vector glyph_data; //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0); } NSLog(@"%@", str); } //str = [str stringByAppendingString: @"\n"]; } break; } } } void ProcessImage(Element *image) { bool image_mask = [image IsImageMask]; bool interpolate = [image IsImageInterpolate]; int width = [image GetImageWidth]; int height = [image GetImageHeight]; int out_data_sz = width * height * 3; NSLog(@"Image: width=\"%d\" height=\"%d", width, height); // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info) // You can use GetImageData to read the raw (decoded) image data //image->GetBitsPerComponent(); //image->GetImageData(); // get raw image data // .... or use Image2RGB filter that converts every image to RGB format, // This should save you time since you don't need to deal with color conversions, // image up-sampling, decoding etc. Image2RGB *img_conv = [[[Image2RGB alloc] initWithImage_element: image] autorelease]; // Extract and convert image to RGB 8-bpc format FilterReader *reader = [[[FilterReader alloc] initWithFilter: img_conv] autorelease]; // A buffer used to keep image data. NSData *image_data_out = [reader Read: out_data_sz]; // &image_data_out.front() contains RGB image data. // Note that you don't need to read a whole image at a time. Alternatively // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) // until the function returns 0. } void ProcessElements(ElementReader *reader) { Element *element; while ((element = [reader Next]) != NULL) // Read page contents { switch ([element GetType]) { case e_path: // Process path data... { ProcessPath(reader, element); } break; case e_text_begin: // Process text block... 
{ ProcessText(reader); } break; case e_form: // Process form XObjects { [reader FormBegin]; ProcessElements(reader); [reader End]; } break; case e_image: // Process Images { ProcessImage(element); } break; } } } int main(int argc, char *argv[]) { NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init]; int ret = 0; [PDFNet Initialize: 0]; @try // Extract text data from all pages in the document { NSLog(@"__________________________________________________"); NSLog(@"Extract page element information from all "); NSLog(@"pages in the document."); PDFDoc *doc = [[[PDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"] autorelease]; [doc InitSecurityHandler]; int pgnum = [doc GetPageCount]; PageIterator *page_begin = [doc GetPageIterator: 1]; ElementReader *page_reader = [[[ElementReader alloc] init] autorelease]; PageIterator *itr; for (itr = page_begin; [itr HasNext]; [itr Next]) // Read every page { NSLog(@"Page %d----------------------------------------", [[itr Current] GetIndex]); [page_reader Begin: [itr Current]]; ProcessElements(page_reader); [page_reader End]; } NSLog(@"Done."); } @catch(NSException *e) { NSLog(@"%@", [e reason]); ret = 1; } [pool release]; return ret; }