'
' Copyright (c) 2001-2012 by PDFTron Systems Inc. All Rights Reserved.
'
' A sample project illustrating some extraction capabilities of ElementReader
' in more detail
'

Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF

' Walks every page of a PDF document with an ElementReader and prints
' information about the path, text and image elements it encounters.
Module Module1

    ' Scratch buffer holding the most recently formatted path segment.
    Dim m_buf As String

    ' Prints the segments of a path element followed by its stroke/fill state.
    '   reader - the reader that produced 'path'; used to enumerate the graphics
    '            state attributes that changed for this element.
    '   path   - the path element to describe.
    Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
        If path.IsClippingPath() Then
            Console.WriteLine("This is a clipping path")
        End If

        Dim segments As PathData = path.GetPathData()
        Dim points As Double() = segments.points
        Dim ops As Byte() = segments.operators

        ' Index of the next unread coordinate in 'points'. Each operator consumes
        ' a fixed number of coordinates (see the cases below).
        Dim pt As Integer = 0
        Dim x1, y1, x2, y2, x3, y3 As Double

        ' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

        ' The point list is wrapped in double quotes; the closing quote is written
        ' after the loop.
        Console.Write(" Path Data Points := """)

        For Each op As Byte In ops
            Select Case CInt(op)
                Case PathData.PathSegmentType.e_moveto
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    pt += 2
                    m_buf = String.Format("M{0:g5} {1:g5}", x1, y1)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_lineto
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    pt += 2
                    m_buf = String.Format(" L{0:g5} {1:g5}", x1, y1)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_cubicto
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    x2 = points(pt + 2)
                    y2 = points(pt + 3)
                    x3 = points(pt + 4)
                    y3 = points(pt + 5)
                    pt += 6
                    m_buf = String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
                                          x1, y1, x2, y2, x3, y3)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_rect
                    ' A rectangle is stored as origin (x1, y1) plus width and height;
                    ' it is printed as the equivalent closed four-corner path.
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    Dim w As Double = points(pt + 2)
                    Dim h As Double = points(pt + 3)
                    pt += 4
                    x2 = x1 + w
                    y2 = y1
                    x3 = x2
                    y3 = y1 + h
                    Dim x4 As Double = x1
                    Dim y4 As Double = y3
                    ' The last corner is (x4, y4); the y coordinate must not be
                    ' replaced by x3.
                    m_buf = String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
                                          x1, y1, x2, y2, x3, y3, x4, y4)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_closepath
                    Console.WriteLine(" Close Path")

                Case Else
                    ' Unknown operator: flag it in debug builds and carry on.
                    System.Diagnostics.Debug.Assert(False)
            End Select
        Next

        Console.Write(""" ")

        Dim gs As GState = path.GetGState()

        ' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
        If path.IsStroked() Then
            Console.WriteLine("Stroke path")
            If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Get stroke color (you can use PDFNet color conversion facilities)
                ' Dim rgb As ColorPt
                ' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
            End If
        Else
            ' Do not stroke path
        End If

        If path.IsFilled() Then
            Console.WriteLine("Fill path")
            If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Dim rgb As ColorPt
                ' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
            End If
        Else
            ' Do not fill path
        End If

        ' Process any changes in graphics state ---------------------------------
        ' The cases are intentionally empty placeholders showing which GState
        ' getter corresponds to each changed attribute.
        Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
        While gs_itr.HasNext()
            Select Case gs_itr.Current()
                Case GState.GStateAttribute.e_transform
                    ' Get transform matrix for this element. Unlike path.GetCTM()
                    ' that return full transformation matrix gs.GetTransform() return
                    ' only the transformation matrix that was installed for this element.
                    '
                    ' gs.GetTransform()
                Case GState.GStateAttribute.e_line_width
                    ' gs.GetLineWidth()
                Case GState.GStateAttribute.e_line_cap
                    ' gs.GetLineCap()
                Case GState.GStateAttribute.e_line_join
                    ' gs.GetLineJoin()
                Case GState.GStateAttribute.e_flatness
                Case GState.GStateAttribute.e_miter_limit
                    ' gs.GetMiterLimit()
                Case GState.GStateAttribute.e_dash_pattern
                    ' Dim dashes As Double()
                    ' gs.GetDashes(dashes)
                    ' gs.GetPhase()
            End Select
            gs_itr.Next()
        End While
    End Sub

    ' Prints every text run of the current text block: font name, effective font
    ' size, fill color and the code and position of each character. Reads
    ' elements from 'page_reader' until the matching e_text_end element (or the
    ' end of the element stream) is reached.
    Sub ProcessText(ByRef page_reader As ElementReader)
        ' Begin text element
        Console.WriteLine("Begin Text Block:")

        Dim elem As Element = page_reader.Next()
        While Not IsNothing(elem)
            Select Case elem.GetType()
                Case Element.Type.e_text_end
                    ' Finish the text block
                    Console.WriteLine("End Text Block.")
                    Return

                Case Element.Type.e_text
                    Dim gs As GState = elem.GetGState()

                    ' gs.GetStrokeColorSpace() / gs.GetStrokeColor() give the stroke color.

                    Dim font As Font = gs.GetFont()
                    Console.Write("Font Name: ")
                    Console.Write(font.GetName())
                    ' font.IsFixedWidth()
                    ' font.IsSerif()
                    ' font.IsSymbolic()
                    ' font.IsItalic()
                    ' ...

                    ' Dim word_spacing As Double = gs.GetWordSpacing()
                    ' Dim char_spacing As Double = gs.GetCharSpacing()

                    ' Use element.GetCTM() if you are interested in the CTM
                    ' (current transformation matrix).
                    Dim ctm As Matrix2D = elem.GetCTM()
                    Dim text_mtx As Matrix2D = elem.GetTextMatrix()

                    ' mtx = CTM concatenated with the text matrix; the length of its
                    ' (m_b, m_d) column is used to scale the nominal font size.
                    Dim mtx As Matrix2D = New Matrix2D
                    mtx.Set(ctm)
                    mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
                    Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
                    Dim font_size As Double = gs.GetFontSize()
                    Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)

                    ' The fill color, converted to RGB, is reported as the font color.
                    Dim font_color As ColorPt = gs.GetFillColor()
                    Dim cs As ColorSpace = gs.GetFillColorSpace()
                    Dim rgb As ColorPt = New ColorPt
                    cs.Convert2RGB(font_color, rgb)
                    Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
                                      CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

                    Dim x, y As Double
                    Dim char_code As Integer

                    Dim itr As CharIterator = elem.GetCharIterator()
                    While itr.HasNext()
                        Console.Write("Character code: ")
                        char_code = itr.Current().char_code
                        Console.Write(Chr(char_code))

                        x = itr.Current().x ' character positioning information
                        y = itr.Current().y

                        ' To get the exact character positioning information you need to
                        ' concatenate current text matrix with CTM and then multiply
                        ' relative positioning coordinates with the resulting matrix.
                        '
                        mtx.Set(ctm)
                        mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
                        mtx.Mult(x, y)
                        Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
                        itr.Next()
                    End While

                    Console.WriteLine()
            End Select

            elem = page_reader.Next()
        End While
    End Sub

    ' Prints the pixel dimensions of an image element and reads its data,
    ' converted to RGB by the Image2RGB filter, into a local buffer.
    Sub ProcessImage(ByRef image As Element)
        ' image.IsImageMask() and image.IsImageInterpolate() are also available.
        Dim width As Integer = image.GetImageWidth()
        Dim height As Integer = image.GetImageHeight()

        ' Three bytes (R, G, B) per pixel.
        Dim out_data_sz As Integer = width * height * 3

        Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

        ' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

        ' You can use GetImageData to read the raw (decoded) image data
        'image.GetBitsPerComponent()
        'image.GetImageData() ' get raw image data
        ' .... or use Image2RGB filter that converts every image to RGB format,
        ' This should save you time since you don't need to deal with color conversions,
        ' image up-sampling, decoding etc.

        Dim img_conv As Image2RGB = New Image2RGB(image) ' Extract and convert image to RGB 8-bpc format
        Dim reader As FilterReader = New FilterReader(img_conv)

        ' A buffer used to keep image data. It must be allocated before the read;
        ' a VB array bound is the upper index, hence 'size - 1' for 'size' bytes
        ' (a zero-sized image yields an empty array).
        Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}

        reader.Read(image_data_out)
        ' image_data_out contains RGB image data.

        ' Note that you don't need to read a whole image at a time. Alternatively
        ' you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
        ' until the function returns 0.
    End Sub

    ' Reads every element available from 'reader' and dispatches it by type to
    ' ProcessPath, ProcessText or ProcessImage. Form XObjects are entered with
    ' FormBegin() and their contents processed recursively.
    Sub ProcessElements(ByRef reader As ElementReader)
        ' Fetch exactly one element before the loop so the first element is
        ' processed rather than discarded.
        Dim elem As Element = reader.Next()

        While Not IsNothing(elem) ' Read page contents
            Select Case elem.GetType()
                Case Element.Type.e_path
                    ' Process path data...
                    ProcessPath(reader, elem)

                Case Element.Type.e_text_begin
                    ' Process text strings...
                    ProcessText(reader)

                Case Element.Type.e_form
                    ' Process form XObjects
                    reader.FormBegin()
                    ProcessElements(reader)
                    reader.End()

                Case Element.Type.e_image
                    ' Process Images
                    ProcessImage(elem)
            End Select

            elem = reader.Next()
        End While
    End Sub

    ' Entry point: opens newsletter.pdf from the test-file folder and prints the
    ' crop box and element information of every page.
    Sub Main()
        PDFNet.Initialize()

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../TestFiles/"

        Console.WriteLine("-------------------------------------------------")
        Console.WriteLine("Extract page element information from all")
        Console.WriteLine("pages in the document.")

        ' Open the test file
        Console.WriteLine("Opening the input file...")
        Dim doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
        doc.InitSecurityHandler()

        Dim page_reader As ElementReader = New ElementReader
        Dim itr As PageIterator = doc.GetPageIterator()

        While itr.HasNext() ' Read every page
            Console.WriteLine("Page {0:d} ----------------------------------------", _
                              itr.GetPageNumber())

            Dim crop_box As Rect = itr.Current().GetCropBox()
            Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
            Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

            page_reader.Begin(itr.Current())
            ProcessElements(page_reader)
            page_reader.End()

            itr.Next()
        End While

        ' Calling Dispose() on ElementReader/Writer/Builder can result in increased performance and lower memory consumption.
        page_reader.Dispose()
        doc.Close()
        Console.WriteLine("Done.")

        PDFNet.Terminate()
    End Sub

End Module