Some test text!

platform
search
< Linux samples

PDF Image Extraction in Python

Sample Python code for using PDFTron SDK to extract images from PDF files, along with their positioning information and DPI. Instead of converting PDF images to a Bitmap, you can also extract uncompressed/compressed image data directly using element.GetImageData() (described in the PDF Data Extraction code sample). Learn more about our PDF Parsing & Content Extraction Library.

Step 1: Get your free trial license key, or sign in

Start Trial
Sign in

Step 2: Add the code:

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

import site
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

#-----------------------------------------------------------------------------------
# This sample illustrates one approach to PDF image extraction 
# using PDFNet.
# 
# Note: Besides direct image export, you can also convert PDF images 
# to GDI+ Bitmap, or extract uncompressed/compressed image data directly 
# using element.GetImageData() (e.g. as illustrated in ElementReaderAdv 
# sample project).
#-----------------------------------------------------------------------------------

image_counter = 0

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

def ImageExtract(reader):
    element = reader.Next()
    while element != None:
        if (element.GetType() == Element.e_image or
            element.GetType() == Element.e_inline_image):
            global image_counter
            image_counter =image_counter + 1
            print("--> Image: " + str(image_counter))
            print("Width: " + str(element.GetImageWidth()))
            print("Height: " + str(element.GetImageHeight()))
            print("BPC: " + str(element.GetBitsPerComponent()))
            
            ctm = element.GetCTM()
            x2 = 1
            y2 = 1
            pt = Point(x2, y2)
            point = ctm.Mult(pt)
            print("Coords: x1=" + str(ctm.m_h) + ", y1=" + str(ctm.m_v) + ", x2=" + str(point.x) + ", y2=" + str(point.y))
            
            if element.GetType() == Element.e_image:
                image = Image(element.GetXObject())
                
                fname = "image_extract1_" + str(image_counter)
                
                path = output_path + fname
                image.Export(path)
                
                #path = output_path + fname + ".tif"
                #image.ExportAsTiff(path)
                
                #path = output_path + fname + ".png"
                #image.ExportAsPng(path)
        elif element.GetType() == Element.e_form:
            reader.FormBegin()
            ImageExtract(reader)
            reader.End()            
        element = reader.Next()

def main():
    # Initialize PDFNet
    PDFNet.Initialize()    
    
    # Example 1: 
    # Extract images by traversing the display list for 
    # every page. With this approach it is possible to obtain 
    # image positioning information and DPI.
    
    doc = PDFDoc(input_path + "newsletter.pdf")
    doc.InitSecurityHandler()
    
    reader = ElementReader()
    
    # Read every page
    itr = doc.GetPageIterator()
    while itr.HasNext():
        reader.Begin(itr.Current())
        ImageExtract(reader)
        reader.End()
        itr.Next()

    doc.Close()
    print("Done.")
    
    print("----------------------------------------------------------------")
    
    # Example 2: 
    # Extract images by scanning the low-level document.
    
    doc = PDFDoc(input_path + "newsletter.pdf")
    doc.InitSecurityHandler()
    image_counter= 0
    
    cos_doc = doc.GetSDFDoc()
    num_objs = cos_doc.XRefSize()
    i = 1
    while i < num_objs:
        obj = cos_doc.GetObj(i)
        if(obj is not None and not obj.IsFree() and obj.IsStream()):
            
            # Process only images
            itr = obj.Find("Type")
            
            if not itr.HasNext() or not itr.Value().GetName() == "XObject":
                i = i + 1
                continue
            
            itr = obj.Find("Subtype")
            if not itr.HasNext() or not itr.Value().GetName() == "Image":
                i = i + 1
                continue
            
            image = Image(obj)
            
            image_counter = image_counter + 1
            print("-. Image: " + str(image_counter))
            print("Width: " + str(image.GetImageWidth()))
            print("Height: " + str(image.GetImageHeight()))
            print("BPC: " + str(image.GetBitsPerComponent()))
            
            fname = "image_extract2_" + str(image_counter)
                
            path = output_path + fname
            image.Export(path)
            
            #path = output_path + fname + ".tif"
            #image.ExportAsTiff(path)
            
            #path = output_path + fname + ".png"
            #image.ExportAsPng(path)
        i = i + 1
    doc.Close()
    
if __name__ == '__main__':
    main()