Some test text!

Search
Hamburger Icon

PDF data extraction in C# (images, text, paths)

More languages

More languages
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample C# code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our C# PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
// A sample project illustrating some extraction capabilities of ElementReader
// in more detail
//---------------------------------------------------------------------------------------

using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;

using NUnit.Framework;

namespace MiscellaneousSamples
{
	/// <summary>
	/// Sample illustrating ElementReader extraction in more detail: path data,
	/// text runs, images, and changes in the graphics state.
	/// </summary>
	[TestFixture]
	public class ElementReaderAdvTest
	{
		
		// Relative path (with trailing slash) to the folder containing test files;
		// Sample() prepends it to the name of the input PDF.
		const string input_path =  "TestFiles/";

		// Scratch string: ProcessPath stores each formatted path segment here
		// immediately before writing it to the console.
		static string m_buf;

		/// <summary>
		/// Writes a description of a path element to the console: whether it is a
		/// clipping path, its segment data, and whether it is stroked and/or filled.
		/// Afterwards walks the reader's graphics-state change list, descending into
		/// non-shading fill patterns, and clears that list.
		/// </summary>
		/// <param name="reader">Reader that produced <paramref name="path"/>; supplies the graphics-state change list.</param>
		/// <param name="path">The path element to describe.</param>
		static public void ProcessPath(ElementReader reader, Element path)
		{
			if (path.IsClippingPath())
			{
				Console.WriteLine("This is a clipping path");
			}

			PathData segments = path.GetPathData();
			double[] coords = segments.points;
			byte[] ops = segments.operators;

			// Position of the next unread value in coords; each segment consumes
			// its coordinates in order.
			int next = 0;

			// Use path.GetCTM() if you are interested in CTM (current transformation matrix).

			Console.Write(" Path Data Points := \"");
			foreach (byte op in ops)
			{
				switch ((PathData.PathSegmentType)((int)op))
				{
					case PathData.PathSegmentType.e_moveto:
					{
						// One point.
						double x = coords[next++];
						double y = coords[next++];
						m_buf = string.Format("M{0:n0} {1:n0}", x, y);
						Console.Write(m_buf);
						break;
					}
					case PathData.PathSegmentType.e_lineto:
					{
						// One point.
						double x = coords[next++];
						double y = coords[next++];
						m_buf = string.Format(" L{0:n0} {1:n0}", x, y);
						Console.Write(m_buf);
						break;
					}
					case PathData.PathSegmentType.e_cubicto:
					{
						// Three points.
						double ax = coords[next++];
						double ay = coords[next++];
						double bx = coords[next++];
						double by = coords[next++];
						double cx = coords[next++];
						double cy = coords[next++];
						m_buf = string.Format(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
							ax, ay, bx, by, cx, cy);
						Console.Write(m_buf);
						break;
					}
					case PathData.PathSegmentType.e_rect:
					{
						// Stored as origin + width + height; printed as the
						// equivalent closed four-corner outline.
						double x0 = coords[next++];
						double y0 = coords[next++];
						double w = coords[next++];
						double h = coords[next++];
						double xFar = x0 + w;
						double yFar = y0 + h;
						m_buf = string.Format("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
							x0, y0, xFar, y0, xFar, yFar, x0, yFar);
						Console.Write(m_buf);
						break;
					}
					case PathData.PathSegmentType.e_closepath:
						Console.WriteLine(" Close Path");
						break;
					default:
						// Any other operator value is unexpected here.
						System.Diagnostics.Debug.Assert(false);
						break;
				}
			}

			Console.Write("\" ");

			GState gs = path.GetGState();

			// Path state (stroke, fill) ------------------------------------------------
			if (path.IsStroked())
			{
				Console.WriteLine("Stroke path");

				bool strokeUsesPattern = gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern;
				if (strokeUsesPattern)
				{
					Console.WriteLine("Path has associated pattern");
				}
				// Otherwise the stroke color can be read (and converted with the
				// PDFNet color conversion facilities):
				// ColorPt rgb = new ColorPt();
				// gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
			}

			if (path.IsFilled())
			{
				Console.WriteLine("Fill path");

				bool fillUsesPattern = gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern;
				if (fillUsesPattern)
				{
					Console.WriteLine("Path has associated pattern");
				}
				// Otherwise:
				// ColorPt rgb = new ColorPt();
				// gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
			}

			// Changes in graphics state ------------------------------------------------
			GSChangesIterator changes = reader.GetChangesIterator();
			while (changes.HasNext())
			{
				switch (changes.Current())
				{
					// Attributes this sample only acknowledges. The matching accessors are:
					//   e_transform    -> gs.GetTransform()  (only the matrix installed for
					//                     this element, unlike path.GetCTM() which returns
					//                     the full transformation matrix)
					//   e_line_width   -> gs.GetLineWidth()
					//   e_line_cap     -> gs.GetLineCap()
					//   e_line_join    -> gs.GetLineJoin()
					//   e_miter_limit  -> gs.GetMiterLimit()
					//   e_dash_pattern -> gs.GetDashes(dashes), gs.GetPhase()
					case GState.GStateAttribute.e_transform:
					case GState.GStateAttribute.e_line_width:
					case GState.GStateAttribute.e_line_cap:
					case GState.GStateAttribute.e_line_join:
					case GState.GStateAttribute.e_flatness:
					case GState.GStateAttribute.e_miter_limit:
					case GState.GStateAttribute.e_dash_pattern:
						break;

					case GState.GStateAttribute.e_fill_color:
					{
						bool patternFill = gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern;
						if (patternFill && gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
						{
							// Process the pattern data.
							reader.PatternBegin(true);
							ProcessElements(reader);
							reader.End();
						}
						break;
					}
				}

				changes.Next();
			}

			reader.ClearChangeList();
		}

		/// <summary>
		/// Reads elements from <paramref name="page_reader"/> until the end of the
		/// current text block, writing each text run's font name and its printable
		/// character codes to the console.
		/// </summary>
		/// <remarks>
		/// Text runs set in a Type 3 font are not printed character by character;
		/// instead each glyph is opened with <c>Type3FontBegin</c> and handed to
		/// <c>ProcessElements</c>. The method returns when an <c>e_text_end</c>
		/// element is read or when the reader has no more elements.
		/// </remarks>
		/// <param name="page_reader">Reader to pull elements from; ProcessElements calls this right after reading an <c>e_text_begin</c> element.</param>
		public static void ProcessText(ElementReader page_reader)
		{
			// Begin text element
			Console.WriteLine("Begin Text Block:");

			Element element;
			while ((element = page_reader.Next()) != null)
			{
				switch (element.GetType())
				{
					case Element.Type.e_text_end:
						// Finish the text block
						Console.WriteLine("End Text Block.");
						return;

					case Element.Type.e_text:
					{
						GState gs = element.GetGState();

						// Fill color converted to RGB (demonstrates the API; the
						// result is not printed).
						ColorSpace cs_fill = gs.GetFillColorSpace();
						ColorPt fill = gs.GetFillColor();

						ColorPt outc = new ColorPt();
						cs_fill.Convert2RGB(fill, outc);

						// Stroke color information is available the same way.
						ColorSpace cs_stroke = gs.GetStrokeColorSpace();
						ColorPt stroke = gs.GetStrokeColor();

						Font font = gs.GetFont();

						Console.Write("Font Name: ");
						Console.WriteLine(font.GetName());
						// font.IsFixedWidth();
						// font.IsSerif();
						// font.IsSymbolic();
						// font.IsItalic();
						// ...

						// double word_spacing = gs.GetWordSpacing();
						// double char_spacing = gs.GetCharSpacing();

						// Use element.GetCTM() if you are interested in the CTM
						// (current transformation matrix).
						if (font.GetType() == Font.Type.e_Type3)
						{
							// Type 3 font: process the data of each glyph.
							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
							{
								page_reader.Type3FontBegin(itr.Current());
								ProcessElements(page_reader);
								page_reader.End();
							}
						}
						else
						{
							Matrix2D ctm = element.GetCTM();
							Matrix2D text_mtx = element.GetTextMatrix();

							// To get the exact character positioning information you need to
							// concatenate current text matrix with CTM and then multiply
							// relative positioning coordinates with the resulting matrix.
							// The product is the same for every character in the run, so it
							// is computed once here rather than inside the loop.
							Matrix2D mtx2 = ctm * text_mtx;

							// To report the effective font size and fill color:
							//
							// double font_sz_scale_factor = System.Math.Sqrt(mtx2.m_b * mtx2.m_b + mtx2.m_d * mtx2.m_d);
							// double font_size = gs.GetFontSize();
							// Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size);
							//
							// ColorPt rgb = new ColorPt();
							// gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
							// Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}",
							// 	(byte)(rgb.Get(0)*255),
							// 	(byte)(rgb.Get(1)*255),
							// 	(byte)(rgb.Get(2)*255));

							double x, y;
							int char_code;

							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
							{
								Console.Write("Character code: ");
								char_code = itr.Current().char_code;

								// Print only if in ASCII range. Both bounds must hold:
								// with "||" the test was true for every value, so codes
								// outside the range were printed as well.
								if (char_code >= 32 && char_code <= 127)
								{
									Console.Write((char)char_code);
								}

								x = itr.Current().x;		// character positioning information
								y = itr.Current().y;

								// Map the relative position through CTM * text matrix.
								mtx2.Mult(ref x, ref y);
								// Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
							}
						}

						Console.WriteLine();
						break;
					}
				}
			}
		}

		// Counter used to number extracted image files. Currently referenced only
		// by the commented-out bitmap-export code in ProcessImage.
		static int image_counter = 0;

		/// <summary>
		/// Writes an image element's pixel dimensions to the console and reads the
		/// image into a byte buffer through an <c>Image2RGB</c> filter.
		/// </summary>
		/// <param name="image">The image element to process.</param>
		/// <exception cref="OverflowException">
		/// The buffer size (width * height * 3) does not fit in an <see cref="int"/>.
		/// </exception>
		public static void ProcessImage(Element image)
		{
			// Further image properties are available, e.g.:
			// image.IsImageMask();
			// image.IsImageInterpolate();
			int width = image.GetImageWidth();
			int height = image.GetImageHeight();

			Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);

			// Three bytes per pixel for the RGB 8-bpc output of Image2RGB. The
			// multiplication is checked so that very large dimensions raise an
			// OverflowException instead of silently wrapping to a wrong buffer size.
			int out_data_sz = checked(width * height * 3);

			// Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

// 			++image_counter;
// 			System.Drawing.Bitmap bmp = image.GetBitmap();
// 			bmp.Save(Utils.CreateExternalFile("reader_img_extract_") + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
//
			// Alternatively you can use GetImageData to read the raw (decoded) image data
			// image.GetBitsPerComponent();
			// image.GetImageData();	// get raw image data
			// Another approach is to use the Image2RGB filter, which converts every image
			// to RGB format. This could save you time since you don't need to deal with
			// color conversions, image up-sampling, decoding etc.
			// ----------------
			Image2RGB img_conv = new Image2RGB(image);	// Extract and convert image to RGB 8-bpc format
			FilterReader reader = new FilterReader(img_conv);
			byte[] image_data_out = new byte[out_data_sz];	// A buffer used to keep image data.
			reader.Read(image_data_out);	// image_data_out contains RGB image data.
			// ----------------
			// Note that you don't need to read a whole image at a time. Alternatively
			// you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
			// until the function returns 0.
		}

	/// <summary>
	/// Reads every remaining element from <paramref name="reader"/> and dispatches
	/// paths, text blocks, form XObjects and images to their handlers. Form
	/// XObjects are entered and processed recursively; all other element types
	/// are skipped.
	/// </summary>
	/// <param name="reader">Reader to pull elements from.</param>
	static void ProcessElements(ElementReader reader) 
	{
		for (Element current = reader.Next(); current != null; current = reader.Next())
		{
			Element.Type kind = current.GetType();

			if (kind == Element.Type.e_path)
			{
				// Path data
				ProcessPath(reader, current);
			}
			else if (kind == Element.Type.e_text_begin)
			{
				// Text strings: ProcessText reads on until the text block ends
				ProcessText(reader);
			}
			else if (kind == Element.Type.e_form)
			{
				// Form XObject: descend into its content, then return to the parent
				reader.FormBegin();
				ProcessElements(reader);
				reader.End();
			}
			else if (kind == Element.Type.e_image)
			{
				// Images
				ProcessImage(current);
			}
		}
	}

		/// <summary>
		/// NUnit test that opens <c>newsletter.pdf</c> from the test-file folder and
		/// runs <c>ProcessElements</c> over every page, writing the extracted
		/// element information to the console.
		/// </summary>
		/// <remarks>
		/// A <c>PDFNetException</c> is logged to the console and fails the test with
		/// the exception message.
		/// </remarks>
		[Test]
		public static void Sample()
		{
			try
			{

				Console.WriteLine("-------------------------------------------------");
				Console.WriteLine("Extract page element information from all ");
				Console.WriteLine("pages in the document.");

				// Open the test file
				using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
				{
					doc.InitSecurityHandler();

					using (ElementReader page_reader = new ElementReader())
					{
						// Read every page
						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
						{
							Console.WriteLine("Page {0:d}----------------------------------------",
								itr.GetPageNumber());

							Rect crop_box = itr.Current().GetCropBox();
							crop_box.Normalize();

							// Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
							// Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());

							page_reader.Begin(itr.Current());
							ProcessElements(page_reader);
							page_reader.End();
						}
					}

					Console.WriteLine("Done.");
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
				// Fail with the SDK's message so the test report states the cause;
				// Assert.True(false) reported a failure with no explanation.
				Assert.Fail(e.Message);
			}
		}
	}
}