//
// Copyright (c) 2001-2012 by PDFTron Systems Inc. All Rights Reserved.
//

using System;
using System.IO;
using System.Collections;
using W = System.Web.UI;

using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;

namespace pdftron
{
    /// <summary>
    /// PdfToHtml implements a PDF to HTML converter using PDFNet.
    /// Text is emitted as absolutely positioned <c>span</c> elements (one per word);
    /// everything that is not emitted as text is exported as a PNG image per page.
    /// </summary>
    class PdfToHtml
    {
        /// <summary>
        /// Converts every page of <paramref name="doc"/> at zoom 1 with an empty output path.
        /// </summary>
        /// <param name="wri">Writer receiving the HTML. It is closed when conversion completes.</param>
        /// <param name="doc">The document to convert.</param>
        public void Convert(TextWriter wri, PDFDoc doc)
        {
            Convert(wri, doc, -1, 1, "");
        }

        /// <summary>
        /// Converts one page, or all pages, of <paramref name="doc"/> to a single HTML document.
        /// </summary>
        /// <param name="wri">Writer receiving the HTML. It is closed when conversion completes.</param>
        /// <param name="doc">The document to convert.</param>
        /// <param name="page_number">Page to convert, or -1 to convert every page stacked vertically.</param>
        /// <param name="zoom">Scale factor applied to page sizes, positions and font sizes.</param>
        /// <param name="outpath">Prefix prepended to the background image file names when exporting them.</param>
        /// <exception cref="ArgumentNullException"><paramref name="wri"/> or <paramref name="doc"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The requested page could not be obtained from the document.</exception>
        public void Convert(TextWriter wri, PDFDoc doc, int page_number, double zoom, string outpath)
        {
            if (wri == null)
            {
                throw new ArgumentNullException("wri");
            }
            if (doc == null)
            {
                throw new ArgumentNullException("doc");
            }

            _doc = doc;
            _zoom = zoom;
            _out_path = outpath;

            // Start from an empty style table so that reusing this instance does not
            // leak CSS classes collected during a previous conversion.
            _style_map.Clear();

            using (W.HtmlTextWriter html = new W.HtmlTextWriter(wri))
            {
                html.RenderBeginTag(W.HtmlTextWriterTag.Html);
                html.RenderBeginTag(W.HtmlTextWriterTag.Head);

                html.AddAttribute("http-equiv", "Content-Type");
                html.AddAttribute("content", "text/html");
                html.AddAttribute("charset", html.Encoding.WebName);
                html.RenderBeginTag(W.HtmlTextWriterTag.Meta);
                html.RenderEndTag(); // </meta>

                html.RenderBeginTag(W.HtmlTextWriterTag.Title);
                html.Write("PDFTron PdfToHtml Sample");
                html.RenderEndTag(); // </title>

                html.RenderEndTag(); // </head>

                html.RenderBeginTag(W.HtmlTextWriterTag.Body);

                // Pages are rendered to strings first because rendering is what
                // populates _style_map; the style section is written afterwards.
                ArrayList html_page_list = new ArrayList();
                _page_offset = 0;

                if (page_number == -1)
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                    {
                        _page_height = (int)(itr.Current().GetPageHeight() * _zoom);
                        html_page_list.Add(RenderHTMLBody(itr.Current()));
                        _page_offset += _page_height;
                    }
                }
                else
                {
                    Page page = doc.GetPage(page_number);
                    if (page == null)
                    {
                        throw new ArgumentOutOfRangeException("page_number", page_number,
                            "The document does not contain the requested page.");
                    }

                    _page_height = (int)(page.GetPageHeight() * _zoom);
                    html_page_list.Add(RenderHTMLBody(page));
                }

                // Write global CSS style section.
                // Attributes must be queued BEFORE RenderBeginTag, otherwise they are
                // not emitted on this tag.
                html.AddAttribute("type", "text/css");
                html.RenderBeginTag(W.HtmlTextWriterTag.Style);
                foreach (DictionaryEntry style in _style_map)
                {
                    html.Write(".f" + style.Value + "{" + style.Key + "}\n");
                }
                html.RenderEndTag(); // </style>

                foreach (Object page in html_page_list)
                {
                    html.Write(page);
                }

                html.RenderEndTag(); // </body>
                html.RenderEndTag(); // </html>

                wri.Close();
            }
        }

        /// <summary>
        /// Renders one page as an absolutely positioned <c>div</c> containing one
        /// <c>span</c> per extracted word and, when enabled, the background image.
        /// </summary>
        /// <param name="page">The page to render.</param>
        /// <returns>The HTML markup for the page.</returns>
        private string RenderHTMLBody(Page page)
        {
            StringWriter strw = new StringWriter();
            using (W.HtmlTextWriter html = new W.HtmlTextWriter(strw, " "))
            {
                using (TextExtractor txt = new TextExtractor())
                {
                    // Write page size info.
                    html.AddAttribute("id", page.GetIndex().ToString());
                    html.AddStyleAttribute("position", "absolute");
                    html.AddStyleAttribute("white-space", "nowrap");
                    html.AddStyleAttribute("color", "#000000");
                    html.AddStyleAttribute("top", _page_offset + "px");
                    html.AddStyleAttribute("left", "0px");
                    html.AddStyleAttribute("width", ((int)(page.GetPageWidth() * _zoom)).ToString() + "px");
                    html.AddStyleAttribute("height", _page_height.ToString() + "px");
                    html.RenderBeginTag("div");

                    txt.Begin(page, null, TextExtractor.ProcessingFlags.e_none);

                    // For each line on the page...
                    for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                    {
                        if (line.GetNumWords() == 0)
                        {
                            continue;
                        }

                        // For now, skip rotated lines.
                        if (!line.IsSimpleLine())
                        {
                            continue;
                        }

                        // For each word in the line...
                        for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                        {
                            if (word.GetStringLen() == 0)
                            {
                                continue;
                            }

                            TextExtractor.Style s = word.GetStyle();
                            Font f = new Font(s.GetFont());

                            // Words set in ZapfDingbats are not output as text;
                            // ProcessElements() sends such text to the background image instead.
                            if (f.GetName().IndexOf("ZapfDingbats", StringComparison.Ordinal) >= 0)
                            {
                                continue;
                            }

                            // Every word becomes its own span positioned relative to the page.
                            AddStyle(html, s, word.GetBBox(), page, false);
                            html.RenderBeginTag("span");
                            html.WriteEncodedText(word.GetString());
                            html.RenderEndTag(); // </span>
                        }
                    }
                }

                if (_draw_back_image)
                {
                    string img = RenderHTMLBackgroundImage(page);
                    if (img != "")
                    {
                        html.AddAttribute("src", img);
                        html.AddAttribute("alt", "background image");
                        html.RenderBeginTag("img");
                        html.RenderEndTag(); // </img>
                    }
                }

                html.RenderEndTag(); // </div>
                return strw.ToString();
            }
        }

        /// <summary>
        /// Copies the elements of <paramref name="page"/> selected by
        /// <see cref="ProcessElements"/> onto a new page and exports that page as a PNG.
        /// </summary>
        /// <param name="page">The source page.</param>
        /// <returns>The image file name (without the output path prefix).</returns>
        private string RenderHTMLBackgroundImage(Page page)
        {
            Page new_page = _doc.PageCreate();

            try
            {
                _reader = new ElementReader();
                _writer = new ElementWriter();
                _builder = new ElementBuilder();

                _writer.Begin(new_page);
                _reader.Begin(page);
                ProcessElements();
                _writer.End();
                _reader.End();
            }
            finally
            {
                // Release the helpers even when element processing throws.
                if (_writer != null)
                {
                    _writer.Dispose();
                    _writer = null;
                }
                if (_reader != null)
                {
                    _reader.Dispose();
                    _reader = null;
                }
                if (_builder != null)
                {
                    _builder.Dispose();
                    _builder = null;
                }
            }

            new_page.SetMediaBox(page.GetCropBox());
            new_page.SetRotation(page.GetRotation());

            string filename = "";
            using (PDFDraw draw = new PDFDraw())
            {
                draw.SetDPI(_zoom * 72);
                filename = String.Format("page{0}.png", page.GetIndex());
                draw.Export(new_page, _out_path + filename, "png");
            }

            return filename;
        }

        /// <summary>
        /// Queues the inline position/width style and the shared CSS class attribute
        /// for the next tag rendered on <paramref name="html"/>.
        /// </summary>
        /// <param name="html">Writer on which the attributes are queued.</param>
        /// <param name="s">Text style supplying font size, color, family, italic flag and weight.</param>
        /// <param name="bbox">Bounding box of the text.</param>
        /// <param name="page">Page containing the text; its height is used to flip the y axis.</param>
        /// <param name="relative">
        /// When true the box is used as-is with <c>position:relative</c>; otherwise the
        /// y coordinate is measured from the top of the page with <c>position:absolute</c>.
        /// </param>
        private void AddStyle(W.HtmlTextWriter html, TextExtractor.Style s, Rect bbox, Page page, bool relative)
        {
            double xpos = _zoom * bbox.x1;
            double ypos;
            if (relative)
            {
                ypos = _zoom * bbox.y1;
            }
            else
            {
                // Convert to a distance from the top edge of the page.
                ypos = _zoom * (page.GetPageHeight() - bbox.y1 - bbox.Height());
            }

            html.AddStyleAttribute("top", ((int)ypos).ToString() + "px");
            html.AddStyleAttribute("left", ((int)xpos).ToString() + "px");
            html.AddStyleAttribute("width", ((int)(_zoom * bbox.Width())).ToString() + "px");

            string font_class = "position:" + (relative ? "relative" : "absolute");

            // Format with the invariant culture so the decimal separator is always '.',
            // and give the length an explicit unit so the declaration is valid CSS.
            font_class += ";font-size:"
                + (_zoom * s.GetFontSize()).ToString("G4", System.Globalization.CultureInfo.InvariantCulture)
                + "px";

            if (s.GetColor().ToArgb() != System.Drawing.Color.Black.ToArgb())
            {
                font_class += ";color:" + System.Drawing.ColorTranslator.ToHtml(s.GetColor());
            }

            // Trim away some characters from the font name that are not liked by CSS.
            string fnt = s.GetFontName();
            int idx = fnt.IndexOf('-');
            if (idx >= 0)
            {
                fnt = fnt.Substring(0, idx);
            }

            fnt += ",ArialUnicode,Arial,Helvetica";

            // NOTE(review): serif styles receive a "sans-serif" generic fallback here,
            // which looks inverted -- confirm the intent before changing it.
            if (s.IsSerif())
            {
                fnt += ",sans-serif";
            }

            font_class += ";font-family:" + fnt;

            if (s.IsItalic())
            {
                font_class += ";font-style:" + "italic";
            }

            font_class += ";font-weight:" + s.GetWeight().ToString();

            // Identical declarations share one CSS class, numbered in order of first use.
            int class_id = 0;
            if (!_style_map.ContainsKey(font_class))
            {
                class_id = _style_map.Count;
                _style_map.Add(font_class, class_id);
            }
            else
            {
                class_id = (int)_style_map[font_class];
            }

            html.AddAttribute("class", "f" + class_id.ToString());
        }

        // Maps a CSS declaration string to the numeric id of its generated ".f<id>" class.
        Hashtable _style_map = new Hashtable();

        /// <summary>
        /// Copies elements from <c>_reader</c> to <c>_writer</c>. Text elements are
        /// dropped unless they are flagged for bitmap output; form XObjects are
        /// processed recursively; every other element is copied unchanged.
        /// </summary>
        private void ProcessElements()
        {
            Element element;
            while ((element = _reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case Element.Type.e_text:
                    {
                        Matrix2D mtx = element.GetCTM() * element.GetTextMatrix();

                        // Rotated or vertically skewed text -> output as bitmap.
                        bool output_as_bitmap = mtx.m_b != 0;

                        // ZapfDingbats text is skipped by the HTML text output, so keep it in the image.
                        Font f = element.GetGState().GetFont();
                        if (f.GetName().IndexOf("ZapfDingbats", StringComparison.Ordinal) >= 0)
                        {
                            output_as_bitmap = true;
                        }

                        if (output_as_bitmap)
                        {
                            _writer.WriteElement(element);
                        }
                        break;
                    }

                    case Element.Type.e_form:
                    {
                        // Save GState: open a group carrying the form's transform.
                        Element e = _builder.CreateGroupBegin();
                        Matrix2D form_mtx = element.GetGState().GetTransform();
                        Obj m = element.GetXObject().FindObj("Matrix");
                        if (m != null)
                        {
                            form_mtx.Concat(
                                m.GetAt(0).GetNumber(), m.GetAt(1).GetNumber(),
                                m.GetAt(2).GetNumber(), m.GetAt(3).GetNumber(),
                                m.GetAt(4).GetNumber(), m.GetAt(5).GetNumber());
                        }
                        e.GetGState().SetTransform(form_mtx);
                        _writer.WriteElement(e);

                        _reader.FormBegin();

                        // Output the clipping path for the Form XObject.
                        Obj box_obj = element.GetXObject().FindObj("BBox");
                        if (box_obj != null)
                        {
                            Rect bbox = new Rect(box_obj);
                            Element clip = _builder.CreateRect(bbox.x1, bbox.y1, bbox.Width(), bbox.Height());
                            clip.SetPathClip(true);
                            clip.SetPathFill(false);
                            clip.SetPathStroke(false);
                            _writer.WriteElement(clip);
                        }

                        ProcessElements();

                        // Restore the graphics state.
                        _writer.WriteElement(_builder.CreateGroupEnd());
                        _reader.End();
                        break;
                    }

                    default:
                        _writer.WriteElement(element);
                        break;
                }
            }
        }

        // Prefix prepended to exported image file names.
        private string _out_path;
        // Scale factor applied to all page coordinates and font sizes.
        private double _zoom = 1.0;
        // Element helpers; only assigned while RenderHTMLBackgroundImage is running.
        private ElementWriter _writer = null;
        private ElementReader _reader = null;
        private ElementBuilder _builder = null;
        // Document currently being converted.
        private PDFDoc _doc = null;
        // Vertical offset of the current page in the output, and its scaled height (pixels).
        private int _page_offset, _page_height;
        // When true, a background image is exported and referenced for every page.
        bool _draw_back_image = true;

        /// <summary>
        /// Sample entry point: converts the test newsletter to "out.html" at 96 DPI.
        /// </summary>
        static void Main(string[] args)
        {
            PDFNet.Initialize();

            try
            {
                using (PDFDoc doc = new PDFDoc("../../../../TestFiles/newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    string output_path = "";
                    using (TextWriter wri = File.CreateText(output_path + "out.html"))
                    {
                        PdfToHtml pdf2html = new PdfToHtml();
                        pdf2html.Convert(wri, doc, -1, 96.0 / 72.0, output_path);
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            Console.WriteLine("Done.");
        }
    }
}