All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TextExtractor.h
Go to the documentation of this file.
1 //---------------------------------------------------------------------------------------
2 // Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3 // Consult legal.txt regarding legal and license information.
4 //---------------------------------------------------------------------------------------
5 #ifndef PDFTRON_H_CPPPDFTextExtractor
6 #define PDFTRON_H_CPPPDFTextExtractor
7 
8 #include <PDF/Page.h>
9 #include <PDF/Rect.h>
10 #include <Common/UString.h>
11 #include <C/PDF/TRN_TextExtractor.h>
12 #include <vector>
13 #include<PDF/OCG/Context.h>
14 
15 namespace pdftron {
16  namespace PDF {
17 
18 class Style;
19 class Word;
20 class Line;
21 
107 {
108 public:
112 
116  TextExtractor();
117  ~TextExtractor();
118 
124  {
125  // Disables expanding of ligatures using a predefined mapping.
126  // Default ligatures are: fi, ff, fl, ffi, ffl, ch, cl, ct, ll,
127  // ss, fs, st, oe, OE.
129 
130  // Disables removing duplicated text that is frequently used to
131  // achieve visual effects of drop shadow and fake bold.
133 
134  // Treat punctuation (e.g. full stop, comma, semicolon, etc.) as
135  // word break characters.
137 
138  // Enables removal of text that is obscured by images or
139  // rectangles. Because this option adds a small performance
140  // penalty to text extraction, it is not enabled by
141  // default.
143 
144  // Enables removing text that uses rendering mode 3 (i.e. invisible text).
145  // Invisible text is usually used in 'PDF Searchable Images' (i.e. scanned
146  // pages with a corresponding OCR text). As a result, invisible text
147  // will be extracted by default.
149 
150  // Enables removal of text that is marked as part of a Watermark layer.
152  };
153 
163  void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
164 
175  void SetOCGContext(OCG::Context* ctx);
176 
180  int GetWordCount();
181 
187  void SetRightToLeftLanguage(bool rtl);
192  bool GetRightToLeftLanguage();
206  UString GetAsText(bool dehyphen = true);
207 
208 #ifndef SWIG
209  void GetAsText(UString& out_str, bool dehyphen = true);
210 #endif
211 
217  UString GetTextUnderAnnot(const Annot& annot);
218 
219 #ifndef SWIG
220  void GetTextUnderAnnot(UString& out_str, const Annot& annot);
221 #endif
222 
223 
228  {
229  // Output words as XML elements instead of inline text.
231 
232  // Include bounding box information for each XML element.
233  // The bounding box information will be stored as 'bbox' attribute.
235 
236  // Include font and styling information.
238  };
239 
282  UString GetAsXML(UInt32 xml_output_flags = 0);
283 
284 #ifndef SWIG
285  void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
286 #endif
287 
291  int GetNumLines();
292 
293 
300  Line GetFirstLine();
301 
305  void Destroy();
306 
307  // @cond PRIVATE_DOC
308 private:
309  TRN_TextExtractor mp_extractor;
310 
311  // TextExtractor should not be copied
312  TextExtractor(const TextExtractor& other);
313  TextExtractor& operator= (const TextExtractor&);
314  // @endcond
315 };
316 
// Style: value-like wrapper exposing font and color attributes of extracted
// text (font object, size, weight, italic/serif flags, color).
// NOTE(review): the original Doxygen comments are absent from this listing;
// the notes below are derived from the declarations only.
322 class Style
323 {
324 public:
325 
     // Returns the font as an SDF::Obj.
     // NOTE(review): presumably the low-level PDF font object - confirm.
332  SDF::Obj GetFont();
333 
338 
     // Returns the font size as a double (units not visible here).
347  double GetFontSize();
348 
     // Returns the font weight as an int.
     // NOTE(review): the value scale is not visible here - confirm.
357  int GetWeight();
358 
     // Returns whether the style is italic.
363  bool IsItalic();
364 
     // Returns whether the style uses a serif font.
370  bool IsSerif();
371 
     // Returns the color as a vector of ints.
     // NOTE(review): presumably three RGB components, mirroring the
     // rgb[3] overload below - confirm.
375  std::vector<int> GetColor();
376 
     // Overload hidden from SWIG: writes the color into the caller-supplied
     // three-element UInt8 array 'rgb'.
377 #ifndef SWIG
378  void GetColor(UInt8 rgb[3]);
379 #endif
380 
     // Equality / inequality comparison against another Style.
381  bool operator== (const Style& s) const;
382  bool operator!= (const Style& s) const;
383 
     // Default constructor.
384  Style();
385 
     // Implementation details excluded from the generated documentation
     // (@cond PRIVATE_DOC) and compiled out when SWIGHIDDEN is defined:
     // a copy constructor, a constructor from the C-level handle, and the
     // wrapped TRN_TextExtractorStyle handle itself.
386  // @cond PRIVATE_DOC
387  #ifndef SWIGHIDDEN
388  Style(const Style& s);
389  Style(TRN_TextExtractorStyle impl);
390  TRN_TextExtractorStyle mp_style;
391  #endif
392  // @endcond
393 };
394 
// Word: wrapper around a single extracted word. Exposes its glyph count,
// geometry (bounding box and quads), styles, text and a link to the next word.
// NOTE(review): the original Doxygen comments are absent from this listing;
// the notes below are derived from the declarations only.
400 class Word
401 {
402 public:
     // Returns the number of glyphs in this word.
406  int GetNumGlyphs();
407 
     // Returns the word's bounding box as a Rect.
414  Rect GetBBox();
415 
     // Overload hidden from SWIG: writes the bounding box into the
     // caller-supplied four-element array 'out_bbox'.
     // NOTE(review): presumably x1, y1, x2, y2 - confirm ordering.
416 #ifndef SWIG
417  void GetBBox(double out_bbox[4]);
418 #endif
419 
     // Returns the word's quadrilateral as a vector of doubles.
     // NOTE(review): presumably eight values (four x/y corner pairs),
     // mirroring the out_quad[8] overload below - confirm.
424  std::vector<double> GetQuad();
425 
     // Overload hidden from SWIG: writes the quadrilateral into the
     // caller-supplied eight-element array 'out_quad'.
426 #ifndef SWIG
427  void GetQuad(double out_quad[8]);
428 #endif
429 
     // Returns the quadrilateral of the glyph at index 'glyph_idx'.
     // NOTE(review): valid index range is presumably
     // [0, GetNumGlyphs()) - confirm.
435  std::vector<double> GetGlyphQuad(int glyph_idx);
436 
     // Overload hidden from SWIG: writes the glyph quadrilateral into the
     // caller-supplied eight-element array 'out_quad'.
437 #ifndef SWIG
438  void GetGlyphQuad(int glyph_idx, double out_quad[8]);
439 #endif
440 
     // Returns the Style of the character at index 'char_idx'.
445  Style GetCharStyle(int char_idx);
446 
     // Returns the Style associated with the word as a whole.
450  Style GetStyle();
451 
     // Returns the length of the word's string.
455  int GetStringLen();
456 
     // Returns the word's text: a UString under SWIG, otherwise a pointer
     // to const Unicode data.
     // NOTE(review): the pointer form is presumably meant to be paired
     // with GetStringLen() rather than treated as null-terminated, and its
     // lifetime is not visible here - confirm both.
460 #ifdef SWIG
461  UString GetString();
462 #else
463  const Unicode* GetString();
464 #endif
465 
     // Returns the next Word.
     // NOTE(review): presumably an invalid Word (see IsValid) when there
     // is no next word - confirm.
469  Word GetNextWord();
470 
     // Returns this word's number as an int.
     // NOTE(review): presumably its index within the enclosing line - confirm.
476  int GetCurrentNum();
477 
     // Returns whether this object refers to a valid word.
481  bool IsValid();
482 
     // Equality / inequality comparison against another Word; default
     // constructor.
483  bool operator== (const Word&) const;
484  bool operator!= (const Word&) const;
485  Word();
486 
     // Implementation details excluded from the generated documentation
     // (@cond PRIVATE_DOC) and compiled out when SWIGHIDDEN is defined:
     // a constructor from the C-level handle and the wrapped
     // TRN_TextExtractorWord handle itself.
487  // @cond PRIVATE_DOC
488  #ifndef SWIGHIDDEN
489  Word(TRN_TextExtractorWord impl);
490  TRN_TextExtractorWord mp_word;
491  #endif
492  // @endcond
493 };
494 
// Line: wrapper around a single extracted line of text. Exposes its word
// count, geometry, style, paragraph/flow identifiers and links to its words
// and to the next line.
// NOTE(review): the original Doxygen comments are absent from this listing;
// the notes below are derived from the declarations only.
500 class Line {
501 public:
502 
     // Returns the number of words in this line.
506  int GetNumWords();
507 
     // Returns whether this is a "simple" line.
     // NOTE(review): the definition of "simple" is not visible here
     // (possibly: not rotated/skewed) - confirm.
512  bool IsSimpleLine();
513 
     // Returns the line's bounding box: a Rect under SWIG, otherwise a
     // pointer to const doubles. Note that this differs from
     // Word::GetBBox(), which returns a Rect in both configurations.
     // NOTE(review): the pointer presumably addresses four doubles; its
     // lifetime is not visible here - confirm both.
520 #ifdef SWIG
521  Rect GetBBox();
522 #else
523  const double* GetBBox();
524 #endif
525 
     // Returns the line's quadrilateral as a vector of doubles.
     // NOTE(review): presumably eight values (four x/y corner pairs),
     // mirroring the out_quad[8] overload below - confirm.
530  std::vector<double> GetQuad();
531 
     // Overload hidden from SWIG: writes the quadrilateral into the
     // caller-supplied eight-element array 'out_quad'.
532 #ifndef SWIG
533  void GetQuad(double out_quad[8]);
534 #endif
535 
     // Returns the first Word of this line.
540  Word GetFirstWord();
541 
     // Returns the Word at index 'word_idx'.
     // NOTE(review): valid index range is presumably
     // [0, GetNumWords()) - confirm.
546  Word GetWord(int word_idx);
547 
     // Returns the next Line.
     // NOTE(review): presumably an invalid Line (see IsValid) when there
     // is no next line - confirm.
551  Line GetNextLine();
552 
     // Returns this line's number as an int.
     // NOTE(review): presumably its index on the page - confirm.
556  int GetCurrentNum();
557 
     // Returns the Style associated with this line.
561  Style GetStyle();
562 
     // Returns the identifier of the paragraph this line belongs to.
568  int GetParagraphID();
569 
     // Returns the identifier of the flow this line belongs to.
     // NOTE(review): "flow" semantics are not visible here - confirm.
575  int GetFlowID();
576 
     // Returns whether this line ends with a hyphen.
581  bool EndsWithHyphen();
582 
     // Returns whether this object refers to a valid line.
586  bool IsValid();
587 
     // Equality / inequality comparison against another Line; default
     // constructor.
588  bool operator== (const Line&) const;
589  bool operator!= (const Line&) const;
590  Line();
591 
     // Implementation details excluded from the generated documentation
     // (@cond PRIVATE_DOC) and compiled out when SWIGHIDDEN is defined:
     // a constructor from the C-level handle and the wrapped
     // TRN_TextExtractorLine handle itself.
592  // @cond PRIVATE_DOC
593  #ifndef SWIGHIDDEN
594  Line(TRN_TextExtractorLine impl);
595  TRN_TextExtractorLine mp_line;
596  #endif
597  // @endcond
598 };
599 
600 
601 
602 
603 #include <Impl/TextExtractor.inl>
604 
605  }; // namespace PDF
606 }; // namespace pdftron
607 
608 #endif // PDFTRON_H_CPPPDFTextExtractor
bool operator!=(const Line &) const
bool operator==(const Line &) const
UString GetAsText(bool dehyphen=true)
std::vector< double > GetGlyphQuad(int glyph_idx)
Style GetCharStyle(int char_idx)
const double * GetBBox()
TRN_UInt8 UInt8
Definition: BasicTypes.h:15
void SetOCGContext(OCG::Context *ctx)
pdftron::PDF::Line Line
std::vector< double > GetQuad()
std::vector< double > GetQuad()
pdftron::PDF::Style Style
TRN_Unicode Unicode
Definition: BasicTypes.h:22
TRN_UInt32 UInt32
Definition: BasicTypes.h:13
UString GetTextUnderAnnot(const Annot &annot)
bool operator==(const Style &s) const
bool operator!=(const Word &) const
const Unicode * GetString()
void Begin(Page page, const Rect *clip_ptr=0, UInt32 flags=0)
UString GetAsXML(UInt32 xml_output_flags=0)
std::vector< int > GetColor()
bool operator!=(const Style &s) const
pdftron::PDF::Word Word
void SetRightToLeftLanguage(bool rtl)
Word GetWord(int word_idx)
bool operator==(const Word &) const