TextExtractor.h

Go to the documentation of this file.
00001 //---------------------------------------------------------------------------------------
00002 // Copyright (c) 2001-2008 by PDFTron Systems Inc. All Rights Reserved.
00003 // Consult legal.txt regarding legal and license information.
00004 //---------------------------------------------------------------------------------------
00005 
00006 #ifndef   HPP_CPPTextExtractor
00007 #define   HPP_CPPTextExtractor
00008 
00009 #include <PDF/Page.h>
00010 #include <PDF/Rect.h>
00011 #include <Common/UString.h>
00012 #include <C/PDF/TRN_TextExtractor.h>
00013 
00014 namespace pdftron { 
00015         namespace PDF {
00016 
// TextExtractor extracts text from a PDF page. Usage, as far as the
// declarations show: call Begin() with a Page, then either fetch the whole
// result (GetAsText / GetAsXML / GetWordCount) or walk it structurally via
// GetFirstLine() -> Line::GetNextLine() and Line::GetFirstWord() ->
// Word::GetNextWord().
//
// The object holds a single TRN_TextExtractor handle (mp_extractor).
// NOTE(review): a destructor is declared but no copy constructor or copy
// assignment operator is. If the destructor releases mp_extractor, copying a
// TextExtractor would release the same handle twice -- confirm against
// Impl/TextExtractor.inl and consider disabling copies.
00101 class TextExtractor 
00102 {
00103 public:
00104 
               // Constructs an extractor. No page is attached until Begin() is called.
00108          TextExtractor();
               // Destructor; presumably releases mp_extractor -- confirm in the .inl.
00109          ~TextExtractor();
00110 
              // Options for Begin(). The values are distinct powers of two, so
              // they can presumably be combined with bitwise OR and passed as
              // the 'flags' argument.
00115         enum ProcessingFlags
00116         {
00117                 // Disables expanding of ligatures using a predefined mapping. 
00118                 // Default ligatures are: fi, ff, fl, ffi, ffl, ch, cl, ct, ll, 
00119                 // ss, fs, st, oe, OE. 
00120                 e_no_ligature_exp = 1, 
00121 
00122                 // Disables removing duplicated text that is frequently used to 
00123                 // achieve visual effects of drop shadow and fake bold. 
00124                 e_no_dup_remove = 2,
00125 
00126                 // Treat punctuation (e.g. full stop, comma, semicolon, etc.) as 
00127                 // word break characters. 
00128                 e_punct_break = 4,
00129 
00130                 // Enables removal of text that is obscured by images or 
00131                 // rectangles. Because this option carries a small performance 
00132                 // penalty for text extraction, it is not enabled by 
00133                 // default.
00134                 e_remove_hidden_text = 8, 
00135 
00136                 // Enables removing text that uses rendering mode 3 (i.e. invisible text).
00137                 // Invisible text is usually used in 'PDF Searchable Images' (i.e. scanned 
00138                 // pages with a corresponding OCR text), so invisible text is 
00139                 // extracted by default; set this flag to drop it.
00140                 e_no_invisible_text = 16
00141         };
00142 
               // Starts extraction on 'page'.
               //   page     - the page to process (passed by value).
               //   clip_ptr - optional rectangle; defaults to 0 (null). Presumably
               //              restricts extraction to that region -- TODO confirm.
               //   flags    - defaults to 0; presumably a combination of
               //              ProcessingFlags values.
00152          void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
00153 
               // Returns the word count as an int; presumably for the page given to Begin().
00157          int GetWordCount();
00158 
               // Writes the extracted text into 'out_str'.
               //   dehyphen - defaults to true; presumably rejoins words split by a
               //              line-ending hyphen (cf. Line::EndsWithHyphen) -- TODO confirm.
00172          void GetAsText(UString& out_str, bool dehyphen = true);
00173 
              // Options for GetAsXML(). The values are distinct powers of two, so
              // they can presumably be combined with bitwise OR.
00177         enum XMLOutputFlags 
00178         {
00179                 // Output words as XML elements instead of inline text.
00180                 e_words_as_elements = 1, 
00181 
00182                 // Include bounding box information for each XML element. 
00183                 // The bounding box information will be stored as 'bbox' attribute.
00184                 e_output_bbox = 2, 
00185 
00186                 // Include font and styling information.
00187                 e_output_style_info = 4
00188         };
00189 
               // Writes an XML rendering of the extracted text into 'out_xml'.
               //   xml_output_flags - defaults to 0; presumably a combination of
               //                      XMLOutputFlags values.
00232          void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
00233 
              // Font and styling information for extracted text, as returned by
              // Word::GetStyle(), Word::GetCharStyle() and Line::GetStyle().
              // Wraps a TRN_TextExtractorStyle value (mp_style).
              // Note: none of the accessors or comparison operators below are
              // const-qualified, so they cannot be called on a const Style.
00239         class Style 
00240         {
00241         public:
00242 
                      // Returns the font as an SDF object.
00249                 SDF::Obj GetFont();
00250 
                      // Returns the font name.
00254                 UString GetFontName();
00255 
                      // Returns the font size; units are not visible from this header.
00264                 double GetFontSize();
00265 
                      // Returns the font weight as an int; the scale is not visible here.
00274                 int GetWeight();
00275 
                      // Returns true if the style is italic.
00280                 bool IsItalic();
00281 
                      // Returns true if the font is a serif font.
00287                 bool IsSerif();
00288 
                      // Writes the text color into the caller-supplied 3-element array
                      // (presumably red, green, blue, per the parameter name).
00292                 void GetColor(UInt8 rgb[3]);
00293 
                      // Equality / inequality comparison with another Style.
00294                 bool operator== (const Style& s);
00295                 bool operator!= (const Style& s);
00296 
                      // Default constructor.
00297                 Style();
00298 
                      // Copy constructor.
00300                 Style(const Style& s);
                      // Wraps an existing TRN_TextExtractorStyle. Not declared
                      // 'explicit', so it also acts as an implicit conversion.
00301                 Style(TRN_TextExtractorStyle impl);
                      // Underlying style value; declared in the public section.
00302                 TRN_TextExtractorStyle mp_style;
00304         };
00305 
              // A single word within a Line. Wraps a TRN_TextExtractorWord value
              // (mp_word). Words are reached through Line::GetFirstWord() /
              // Line::GetWord() and advanced with GetNextWord().
00311         class Word 
00312         {
00313         public:
                       // Returns the number of glyphs in this word.
00317                  int GetNumGlyphs();
00318 
                      // Writes the word's bounding box into the caller-supplied
                      // 4-element array. Coordinate order is not visible here.
00325                 void GetBBox(double out_bbox[4]);
00326 
                      // Writes the word's quadrilateral into the caller-supplied
                      // 8-element array (presumably four x,y corner pairs).
00331                 void GetQuad(double out_quad[8]);
00332 
                      // Writes the quadrilateral of glyph 'glyph_idx' into the
                      // caller-supplied 8-element array. Presumably
                      // 0 <= glyph_idx < GetNumGlyphs() -- TODO confirm.
00338                 void GetGlyphQuad(int glyph_idx, double out_quad[8]);
00339 
                      // Returns the Style of the character at 'char_idx'.
00344                 Style GetCharStyle(int char_idx);
00345 
                      // Returns the Style of this word.
00349                 Style GetStyle();
00350 
                      // Returns the string length; presumably the number of Unicode
                      // units in the buffer returned by GetString().
00354                 int GetStringLen();
00355 
                      // Returns a pointer to the word's Unicode text. Ownership,
                      // lifetime and null-termination are not visible from this
                      // header -- pair it with GetStringLen().
00359                 const Unicode* GetString();
00360 
                      // Returns the next Word; check IsValid() on the result.
00364                 Word GetNextWord();
00365 
                      // Returns this word's number; the numbering base is not
                      // visible here.
00371                 int GetCurrentNum();
00372 
                      // Returns whether this object refers to a valid word;
                      // presumably false once iteration has run past the last word.
00376                 bool IsValid();
00377 
                      // Equality / inequality comparison (not const-qualified).
00378                 bool operator== (const Word&);
00379                 bool operator!= (const Word&);
                      // Default constructor.
00380                 Word();
00381 
                      // Wraps an existing TRN_TextExtractorWord. Not declared
                      // 'explicit', so it also acts as an implicit conversion.
00383                 Word(TRN_TextExtractorWord impl);
                      // Underlying word value; declared in the public section.
00384                 TRN_TextExtractorWord mp_word;
00386         };
00387 
              // A line of text. Wraps a TRN_TextExtractorLine value (mp_line).
              // Lines are reached through TextExtractor::GetFirstLine() and
              // advanced with GetNextLine().
00393         class Line {
00394         public: 
00395 
                      // Returns the number of words in this line.
00399                 int GetNumWords();
00400 
                      // Returns whether this is a "simple" line; the exact criterion
                      // is not visible from this header.
00405                 bool IsSimpleLine();
00406 
                      // Returns a pointer to the line's bounding box values. Unlike
                      // Word::GetBBox(), this returns a pointer rather than filling a
                      // caller-supplied array; the element count and the lifetime of
                      // the pointed-to data are not visible here.
00413                 const double* GetBBox();
00414 
                      // Writes the line's quadrilateral into the caller-supplied
                      // 8-element array.
00419                 void GetQuad(double out_quad[8]);
00420 
                      // Returns the first Word of this line.
00425                 Word GetFirstWord();
00426 
                      // Returns the Word at index 'word_idx'. Presumably
                      // 0 <= word_idx < GetNumWords() -- TODO confirm.
00430                 Word GetWord(int word_idx);
00431 
                      // Returns the next Line; check IsValid() on the result.
00435                 Line GetNextLine();
00436 
                      // Returns this line's number; the numbering base is not
                      // visible here.
00440                 int GetCurrentNum();
00441 
                      // Returns the Style of this line.
00445                 Style GetStyle();
00446 
                      // Returns the identifier of the paragraph this line belongs to.
00452                 int GetParagraphID();
00453 
                      // Returns the identifier of the text flow this line belongs to.
00459                 int GetFlowID();
00460 
                      // Returns true if the line ends with a hyphen.
00465                 bool EndsWithHyphen();
00466 
                      // Returns whether this object refers to a valid line;
                      // presumably false once iteration has run past the last line.
00470                 bool IsValid();
00471 
                      // Equality / inequality comparison (not const-qualified).
00472                 bool operator== (const Line&);
00473                 bool operator!= (const Line&);
                      // Default constructor.
00474                 Line();
00475 
                      // Wraps an existing TRN_TextExtractorLine. Not declared
                      // 'explicit', so it also acts as an implicit conversion.
00477                 Line(TRN_TextExtractorLine impl);
                      // Underlying line value; declared in the public section.
00478                 TRN_TextExtractorLine mp_line;
00480         };
00481 
               // Returns the number of lines; presumably for the page given to Begin().
00485          int GetNumLines();
00486 
00487 
               // Returns the first Line; continue with Line::GetNextLine() and
               // check Line::IsValid() on each result.
00494          Line GetFirstLine();
00495 
00496 
00497 private:
              // Handle to the underlying extractor implementation.
00498         TRN_TextExtractor mp_extractor;
00499 };
00500 
00501 
00502 
00503 #include <Impl/TextExtractor.inl>
00504 
00505         };      // namespace PDF
00506 };      // namespace pdftron
00507 
00508 #endif

© 2002-2008 PDFTron Systems Inc.