Some test text!

Reading page content

Page content is represented as a display list: a sequence of graphical Elements such as paths, text, images, and forms. The ordering of Elements in the display list affects only the order in which they are painted, so Elements that occur later in the display list can obscure earlier ones.

A display list can be traversed using an ElementReader object. For example:

// Opens "in.pdf" and hands the content of every page to ProcessElements.
void ReadDoc()
{
  // Open an existing document. The using statements release the document
  // and the reader when the block exits, including when an exception is thrown.
  using (PDFDoc doc = new PDFDoc("in.pdf"))
  using (ElementReader reader = new ElementReader())
  {
    doc.InitSecurityHandler();

    // Read page content on every page in the document
    PageIterator end = doc.PageEnd();
    for (PageIterator itr = doc.PageBegin(); itr != end; itr.Next())
    {
      // Every Begin() is paired with an End() before the reader is reused
      // for the next page, mirroring the FormBegin()/End() pairing used
      // for form elements in ProcessElements.
      reader.Begin(itr.Current());
      ProcessElements(reader);
      reader.End();
    }
  }
}

// Visits each Element that the given reader yields and dispatches on its type.
// Form elements are handled by calling ProcessElements again on the same reader.
void ProcessElements(ElementReader reader)
{
  // Walk the display list; reader.Next() yields null once it is exhausted
  for (Element current = reader.Next(); current != null; current = reader.Next())
  {
    Element.ElementType kind = current.GetType();

    if (kind == Element.ElementType.e_path)
    {
      // Placeholder: clipping paths can be singled out here
      if (current.IsClippingPath())
      {
      }
      // ...
    }
    else if (kind == Element.ElementType.e_text)
    {
      Matrix2D text_mtx = current.GetTextMatrix();
      // ...
    }
    else if (kind == Element.ElementType.e_form)
    {
      // Bracket the nested traversal with FormBegin()/End() so that the
      // recursive call reads the form's elements (presumably the form's own
      // display list) before this loop continues.
      reader.FormBegin();
      ProcessElements(reader);
      reader.End();
    }
    // Any other element type is ignored
  }
}

To start traversing the display list, call reader.Begin(). Each subsequent call to reader.Next() then returns the next Element, and returns null once the end of the display list has been reached.

Note that, while ElementReader only works with one page at a time, the same ElementReader object may be reused to process multiple pages.

Get the answers you need: Support