#!/usr/bin/env ruby
#
# Resource-based image extractor based on PDFlib TET
#

require 'TET'

# Print the following information for each image:
# - pCOS id (required for indexing the images[] array)
# - pixel size of the underlying PDF image XObject
# - number of components, bits per component, and colorspace
# - mergetype if different from "normal", i.e. "artificial"
#   (=merged) or "consumed"
# - "stencilmask" property, i.e. /ImageMask in PDF
# - pCOS id of mask image, i.e. /Mask or /SMask in PDF

# Print a one-line summary of an image resource to standard output.
#
# All values are read through pCOS paths below "images[<imageid>]" and the
# color space referenced by the image's "colorspaceid".
#
# tet     - object answering pcos_get_number(doc, path) and
#           pcos_get_string(doc, path)
# doc     - document handle which is passed through to the pCOS calls
# imageid - pCOS id of the image, i.e. its index in the images[] array
#
# Returns nil.
def report_image_info(tet, doc, imageid)
    image = "images[#{imageid}]"

    width = tet.pcos_get_number(doc, "#{image}/Width")
    height = tet.pcos_get_number(doc, "#{image}/Height")
    bpc = tet.pcos_get_number(doc, "#{image}/bpc")

    # Numeric pCOS results are converted with to_i before they are used as
    # an array index, so that a Float result does not end up as "[2.0]"
    cs = tet.pcos_get_number(doc, "#{image}/colorspaceid").to_i
    colorspace = "colorspaces[#{cs}]"
    components = tet.pcos_get_number(doc, "#{colorspace}/components")

    printf("image I%d: %dx%d pixel, ", imageid, width, height)
    csname = tet.pcos_get_string(doc, "#{colorspace}/name")

    printf("%dx%d bit %s", components, bpc, csname)

    # For indexed color spaces also report the name of the base color space
    if csname == "Indexed"
        basecs = tet.pcos_get_number(doc, "#{colorspace}/baseid").to_i
        basecsname = tet.pcos_get_string(doc, "colorspaces[#{basecs}]/name")
        printf(" %s", basecsname)
    end

    # Check whether this image has been created by merging smaller images
    mergetype = tet.pcos_get_number(doc, "#{image}/mergetype")
    print(", mergetype=artificial") if mergetype == 1

    stencilmask = tet.pcos_get_number(doc, "#{image}/stencilmask")
    print(", used as stencil mask") if stencilmask == 1

    # Check whether the image has an attached mask; -1 means "no mask".
    # The id is formatted with %d like all other numbers in this report, so
    # that a Float result is printed as "5" and not as "5.0".
    maskid = tet.pcos_get_number(doc, "#{image}/maskid")
    printf(", masked with image %d", maskid) if maskid != -1

    print("\n")
    nil
end


# Global option list, passed to tet.set_option
globaloptlist = "searchpath={{../data}}"

# Document-specific option list, passed to tet.open_document
docoptlist = ""

# Page-specific option list, passed to tet.open_page, e.g.
# "imageanalysis={merge={gap=1}}"
pageoptlist = ""

# Basic image extraction options; the per-image "filename" option is
# appended to this list before each image is written (see below)
baseimageoptlist = ""

# Main program: opens the PDF named on the command line, visits every page,
# then reports and writes every image resource which has neither been
# consumed by merging nor flagged as small.
begin
    if ARGV.length != 1
        raise("usage: image_resources.rb <filename>\n")
    end

    tet = TET.new

    # Builds the message for the error most recently recorded by TET
    tet_error = lambda do
        "Error #{tet.get_errnum} in #{tet.get_apiname}(): #{tet.get_errmsg}"
    end

    # Base name for the generated image files: the input name without a
    # trailing ".pdf" (any letter case). Names without that suffix,
    # including names shorter than four characters, are used unchanged.
    outfilebase = ARGV[0].sub(/\.pdf\z/i, "")

    tet.set_option(globaloptlist)

    doc = tet.open_document(ARGV[0], docoptlist)
    raise tet_error.call if doc == -1

    # Get number of pages in the document
    n_pages = tet.pcos_get_number(doc, "length:pages")

    # Loop over pages to trigger image merging
    1.upto(n_pages) do |pageno|
        page = tet.open_page(doc, pageno, pageoptlist)

        if page == -1
            print tet_error.call, "\n"
            next                        # process next page
        end

        # Report a nonzero error number even though the page could be opened
        print tet_error.call, "\n" if tet.get_errnum != 0

        tet.close_page(page)
    end

    # Get the number of images in the document.
    n_images = tet.pcos_get_number(doc, "length:images")

    # Loop over all image resources
    0.upto(n_images - 1) do |imageid|
        image = "images[#{imageid}]"

        # Skip images which have been consumed by merging
        next if tet.pcos_get_number(doc, "#{image}/mergetype") == 2

        # Skip images which have been flagged by the "small image" filter
        next if tet.pcos_get_number(doc, "#{image}/small") == 1

        # Report image details: pixel geometry, color space etc
        report_image_info(tet, doc, imageid)

        # Write image data to file. The basic options come first; strip
        # removes the separating blank while baseimageoptlist is empty.
        imageoptlist = sprintf("%s filename={%s_I%d}",
            baseimageoptlist, outfilebase, imageid).strip
        if tet.write_image_file(doc, imageid, imageoptlist) == -1
            print tet_error.call, "\n"
        end
    end

    tet.close_document(doc)

rescue TETException => pe
    print pe.backtrace.join("\n"), "\n"
    printf("Error %d in %s(): %s\n",
        pe.get_errnum, pe.get_apiname, pe.get_errmsg)
rescue StandardError => e
    # StandardError instead of Exception, so that Interrupt and SystemExit
    # are not swallowed here; the ensure clause below runs in either case
    print e.backtrace.join("\n") + "\n" + e.to_s + "\n"
ensure
    tet.delete() if tet
end



