//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2008 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
// A sample project illustrating some extraction capabilities of ElementReader
// in more detail
//---------------------------------------------------------------------------------------

using System;

using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;

namespace ElementReaderAdvTestCS
{
    /// <summary>
    /// Console sample that walks every page of a PDF with <c>ElementReader</c> and prints
    /// information about the path, text, form and image elements it encounters.
    /// Images are additionally saved as PNG files under <c>output_path</c>.
    /// </summary>
    class Class1
    {
        // Relative path to the folder containing test files.
        static string input_path = "../../../../TestFiles/";
        static string output_path = "../../../../TestFiles/Output/";

        // Scratch buffer holding the most recently formatted path segment.
        static string m_buf;

        /// <summary>
        /// Prints the segments of a path element in an SVG-like notation, reports whether the
        /// path is a clipping path, stroked and/or filled, and walks the graphics state
        /// attributes that changed for this element.
        /// </summary>
        /// <param name="reader">Reader positioned on <paramref name="path"/>; supplies the changes iterator.</param>
        /// <param name="path">The path element to describe.</param>
        public static void ProcessPath(ElementReader reader, Element path)
        {
            if (path.IsClippingPath())
            {
                Console.WriteLine("This is a clipping path");
            }

            double[] data = path.GetPathPoints();
            byte[] opr = path.GetPathTypes();
            int opr_sz = path.GetPathTypesCount();

            // data_itr is the read cursor into the flat coordinate array; every segment
            // type consumes a fixed number of coordinates from it.
            int data_itr = 0;
            double x1, y1, x2, y2, x3, y3;

            // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

            Console.Write(" Path Data Points := \"");
            for (int opr_itr = 0; opr_itr < opr_sz; ++opr_itr)
            {
                switch ((Element.PathSegmentType)((int)opr[opr_itr]))
                {
                    case Element.PathSegmentType.e_moveto:
                        x1 = data[data_itr++];
                        y1 = data[data_itr++];
                        m_buf = string.Format("M{0:g5} {1:g5}", x1, y1);
                        Console.Write(m_buf);
                        break;

                    case Element.PathSegmentType.e_lineto:
                        x1 = data[data_itr++];
                        y1 = data[data_itr++];
                        m_buf = string.Format(" L{0:g5} {1:g5}", x1, y1);
                        Console.Write(m_buf);
                        break;

                    case Element.PathSegmentType.e_cubicto:
                        x1 = data[data_itr++];
                        y1 = data[data_itr++];
                        x2 = data[data_itr++];
                        y2 = data[data_itr++];
                        x3 = data[data_itr++];
                        y3 = data[data_itr++];
                        m_buf = string.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}",
                            new object[] { x1, y1, x2, y2, x3, y3 });
                        Console.Write(m_buf);
                        break;

                    case Element.PathSegmentType.e_rect:
                    {
                        // A rectangle is stored as (x, y, width, height); expand it into
                        // its four corners and print it as a closed polygon.
                        x1 = data[data_itr++];
                        y1 = data[data_itr++];
                        double w = data[data_itr++];
                        double h = data[data_itr++];
                        x2 = x1 + w;
                        y2 = y1;
                        x3 = x2;
                        y3 = y1 + h;
                        double x4 = x1;
                        double y4 = y3;
                        // The last argument must be y4: the fourth corner is (x4, y4).
                        m_buf = string.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z",
                            new object[] { x1, y1, x2, y2, x3, y3, x4, y4 });
                        Console.Write(m_buf);
                        break;
                    }

                    case Element.PathSegmentType.e_closepath:
                        Console.WriteLine(" Close Path");
                        break;

                    default:
                        // Unknown segment type: flag it in debug builds.
                        System.Diagnostics.Debug.Assert(false);
                        break;
                }
            }
            Console.Write("\" ");

            GState gs = path.GetGState();

            // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
            if (path.IsStroked())
            {
                Console.WriteLine("Stroke path");

                if (gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern)
                {
                    Console.WriteLine("Path has associated pattern");
                }
                else
                {
                    // Get stroke color (you can use PDFNet color conversion facilities)
                    // ColorPt rgb;
                    // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
                }
            }
            else
            {
                // Do not stroke path
            }

            if (path.IsFilled())
            {
                Console.WriteLine("Fill path");

                if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern)
                {
                    Console.WriteLine("Path has associated pattern");
                }
                else
                {
                    // ColorPt rgb;
                    // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
                }
            }
            else
            {
                // Do not fill path
            }

            // Process any changes in graphics state ---------------------------------
            // The cases below are intentionally empty; each one shows (in a comment) the
            // GState accessor that returns the changed attribute.
            GSChangesIterator gs_itr = reader.GetChangesIterator();
            for (; gs_itr.HasNext(); gs_itr.Next())
            {
                switch (gs_itr.Current())
                {
                    case GState.GStateAttribute.e_transform:
                        // Get transform matrix for this element. Unlike path.GetCTM(),
                        // which returns the full transformation matrix, gs.GetTransform()
                        // returns only the transformation matrix that was installed for
                        // this element.
                        //
                        // gs.GetTransform();
                        break;
                    case GState.GStateAttribute.e_line_width:
                        // gs.GetLineWidth();
                        break;
                    case GState.GStateAttribute.e_line_cap:
                        // gs.GetLineCap();
                        break;
                    case GState.GStateAttribute.e_line_join:
                        // gs.GetLineJoin();
                        break;
                    case GState.GStateAttribute.e_flatness:
                        break;
                    case GState.GStateAttribute.e_miter_limit:
                        // gs.GetMiterLimit();
                        break;
                    case GState.GStateAttribute.e_dash_pattern:
                    {
                        // double[] dashes;
                        // gs.GetDashes(dashes);
                        // gs.GetPhase()
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Consumes elements from <paramref name="page_reader"/> until the end of the current
        /// text block (or the end of the element stream) and prints, for every text run, the
        /// font name, the scaled font size, the fill color converted to RGB, and the code and
        /// page position of each character.
        /// </summary>
        /// <param name="page_reader">Reader positioned just after an <c>e_text_begin</c> element.</param>
        public static void ProcessText(ElementReader page_reader)
        {
            // Begin text element
            Console.WriteLine("Begin Text Block:");

            Element element;
            while ((element = page_reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case Element.Type.e_text_end:
                        // Finish the text block
                        Console.WriteLine("End Text Block.");
                        return;

                    case Element.Type.e_text:
                    {
                        GState gs = element.GetGState();

                        // The stroke color is available the same way as the fill color used
                        // below: gs.GetStrokeColorSpace() / gs.GetStrokeColor().

                        Font font = gs.GetFont();
                        Console.Write("Font Name: ");
                        Console.Write(font.GetName());
                        // font.IsFixedWidth();
                        // font.IsSerif();
                        // font.IsSymbolic();
                        // font.IsItalic();
                        // ...

                        // double word_spacing = gs.GetWordSpacing();
                        // double char_spacing = gs.GetCharSpacing();

                        // Use element.GetCTM() if you are interested in the CTM
                        // (current transformation matrix).
                        Matrix2D ctm = element.GetCTM();
                        Matrix2D text_mtx = element.GetTextMatrix();
                        Matrix2D mtx = ctm * text_mtx;

                        // Scale the nominal font size by the length of the (m_b, m_d)
                        // column of the combined matrix.
                        double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
                        double font_size = gs.GetFontSize();
                        Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size);

                        // Convert the fill color to RGB once and report it as 0-255 components.
                        ColorPt font_color = gs.GetFillColor();
                        ColorSpace cs = gs.GetFillColorSpace();
                        ColorPt rgb = new ColorPt();
                        cs.Convert2RGB(font_color, rgb);

                        //Color font_color_rgb = Color.FromArgb(255, (byte)(rgb.get_c(0)*255),
                        //    (byte)(rgb.get_c(1)*255), (byte)(rgb.get_c(2)*255));

                        Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}",
                            (byte)(rgb.Get(0) * 255),
                            (byte)(rgb.Get(1) * 255),
                            (byte)(rgb.Get(2) * 255));

                        double x, y;
                        int char_code;

                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            Console.Write("Character code: ");
                            char_code = itr.Current().char_code;
                            Console.Write((char)char_code);

                            x = itr.Current().x; // character positioning information
                            y = itr.Current().y;

                            // To get the exact character positioning information you need to
                            // concatenate current text matrix with CTM and then multiply
                            // relative positioning coordinates with the resulting matrix.
                            // mtx already holds ctm * text_mtx (computed above), so it is
                            // applied directly to the relative coordinates.
                            mtx.Mult(ref x, ref y);
                            Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                        }

                        Console.WriteLine();
                        break;
                    }
                }
            }
        }

        // Number of images processed so far; used to build unique output file names.
        static int image_counter = 0;

        /// <summary>
        /// Prints the pixel dimensions of an image element and saves it as
        /// <c>reader_img_extract_N.png</c> in <c>output_path</c>, where N is a running counter.
        /// </summary>
        /// <param name="image">The image element to extract.</param>
        public static void ProcessImage(Element image)
        {
            // Also available on the element: image.IsImageMask(), image.IsImageInterpolate().
            int width = image.GetImageWidth();
            int height = image.GetImageHeight();

            Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);

            // Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

            ++image_counter;

            // Dispose the bitmap as soon as it has been written so that its GDI+ resources
            // are not held until finalization.
            using (System.Drawing.Bitmap bmp = image.GetBitmap())
            {
                bmp.Save(output_path + "reader_img_extract_" + image_counter.ToString() + ".png",
                    System.Drawing.Imaging.ImageFormat.Png);
            }

            // Alternatively you can use GetImageData to read the raw (decoded) image data
            // image.GetBitsPerComponent();
            // image.GetImageData(); // get raw image data
            // another approach is to use Image2RGB filter that converts every image to
            // RGB format. This could save you time since you don't need to deal with color
            // conversions, image up-sampling, decoding etc.
            // ----------------
            // Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
            // FilterReader reader = new FilterReader(img_conv);
            //
            // int out_data_sz = width * height * 3;
            // byte[] image_data_out = new byte[out_data_sz]; // A buffer used to keep image data.
            // reader.Read(image_data_out); // image_data_out contains RGB image data.
            // ----------------
            // Note that you don't need to read a whole image at a time. Alternatively
            // you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
            // until the function returns 0.
        }

        /// <summary>
        /// Reads every element from <paramref name="reader"/> and dispatches paths, text blocks
        /// and images to the matching Process* method. Form XObjects are entered with
        /// <c>FormBegin()</c>, processed recursively and left with <c>End()</c>.
        /// </summary>
        /// <param name="reader">Reader on which <c>Begin</c> (or <c>FormBegin</c>) has been called.</param>
        static void ProcessElements(ElementReader reader)
        {
            Element element;
            while ((element = reader.Next()) != null) // Read page contents
            {
                switch (element.GetType())
                {
                    case Element.Type.e_path: // Process path data...
                    {
                        ProcessPath(reader, element);
                        break;
                    }
                    case Element.Type.e_text_begin: // Process text strings...
                    {
                        ProcessText(reader);
                        break;
                    }
                    case Element.Type.e_form: // Process form XObjects
                    {
                        reader.FormBegin();
                        ProcessElements(reader);
                        reader.End();
                        break;
                    }
                    case Element.Type.e_image: // Process Images
                    {
                        ProcessImage(element);
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// The main entry point for the application. Opens <c>newsletter.pdf</c> from
        /// <c>input_path</c>, prints the crop box of every page and processes the page's
        /// elements. A <c>PDFNetException</c> is reported by printing its message.
        /// </summary>
        [STAThread]
        static void Main(string[] args)
        {
            PDFNet.Initialize();
            PDFNet.SetResourcesPath("../../../../../resources");

            try
            {
                Console.WriteLine("-------------------------------------------------");
                Console.WriteLine("Extract page element information from all");
                Console.WriteLine("pages in the document.");

                // Open the test file
                Console.WriteLine("Opening the input file...");
                PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf");
                try
                {
                    doc.InitSecurityHandler();

                    ElementReader page_reader = new ElementReader();

                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page
                    {
                        Console.WriteLine("Page {0:d} ----------------------------------------",
                            itr.GetPageNumber());

                        Rect crop_box = itr.Current().GetCropBox();
                        crop_box.Normalize();

                        Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}",
                            crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
                        Console.WriteLine(" Page Size: width={0:f} height={1:f}",
                            crop_box.Width(), crop_box.Height());

                        page_reader.Begin(itr.Current());
                        ProcessElements(page_reader);
                        page_reader.End();
                    }
                }
                finally
                {
                    // Close the document even when page processing throws; an exception
                    // raised here is still handled by the outer catch.
                    doc.Close();
                }

                Console.WriteLine("Done.");
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            PDFNet.Terminate();
        }
    }
}