#!/usr/bin/env ruby
#
# PDF image extractor based on PDFlib TET
#

require 'TET'


# Page number, initialised before the page loop
pageno = 0

# Option list handed to set_option() for the whole TET object
globaloptlist = 'searchpath={{../data}}'

# Option list handed to open_document()
docoptlist = ''

# Option list handed to open_page(), e.g.
# "imageanalysis={merge={gap=1}}"
pageoptlist = ''

# Print a one-line description of an image to stdout:
# - pCOS id (required for indexing the images[] array)
# - pixel size of the underlying PDF image XObject
# - number of components, bits per component, and colorspace name
#   (followed by the base colorspace name for "Indexed" colorspaces)
# - mergetype, but only when it is 1, reported as "artificial" (=merged)
# - "stencilmask" property, i.e. /ImageMask in PDF
# - pCOS id of mask image, i.e. /Mask or /SMask in PDF
#
# @param tet object answering pcos_get_number(doc, path) and
#   pcos_get_string(doc, path)
# @param doc document handle, passed through unchanged to the pCOS calls
# @param imageid [Numeric] index into the pCOS images[] array. It is
#   truncated with to_i before building pCOS paths, so a Float such as a
#   maskid obtained from pcos_get_number yields "images[7]" rather than
#   "images[7.0]".
# @return [Integer] always 0
def report_image_info(tet, doc, imageid)
  index = imageid.to_i
  image = "images[#{index}]"

  width = tet.pcos_get_number(doc, "#{image}/Width")
  height = tet.pcos_get_number(doc, "#{image}/Height")
  bpc = tet.pcos_get_number(doc, "#{image}/bpc")

  # Ids retrieved as numbers are truncated before being used as array indices
  cs = tet.pcos_get_number(doc, "#{image}/colorspaceid").to_i
  components = tet.pcos_get_number(doc, "colorspaces[#{cs}]/components")
  csname = tet.pcos_get_string(doc, "colorspaces[#{cs}]/name")

  printf("image %d: %dx%d pixel, ", index, width, height)
  printf("%dx%d bit %s", components, bpc, csname)

  if csname == "Indexed"
    basecs = tet.pcos_get_number(doc, "colorspaces[#{cs}]/baseid").to_i
    basecsname = tet.pcos_get_string(doc, "colorspaces[#{basecs}]/name")
    printf(" %s", basecsname)
  end

  # Check whether this image has been created by merging smaller images
  mergetype = tet.pcos_get_number(doc, "#{image}/mergetype")
  print(", mergetype=artificial") if mergetype == 1

  stencilmask = tet.pcos_get_number(doc, "#{image}/stencilmask")
  print(", used as stencil mask") if stencilmask == 1

  # Check whether the image has an attached mask (-1 means no mask)
  maskid = tet.pcos_get_number(doc, "#{image}/maskid")
  printf(", masked with image %d", maskid) if maskid != -1

  print("\n")
  0
end

# Main program: open the PDF named on the command line and, for every page,
# report each image returned by get_image_info, write its data to a file and
# do the same for an attached mask image if there is one.
#
# Output files are named
#   <input without .pdf>_p<page>_<n>_I<imageid>[mask_I<maskid>]
# where <n> counts the images found on the page.
begin
  raise "usage: images_per_page.rb <infilename>" if ARGV.length != 1

  tet = TET.new

  # Formats the error state currently held by the TET object
  tet_error = lambda do
    "Error #{tet.get_errnum} in #{tet.get_apiname}(): #{tet.get_errmsg}"
  end

  # Strip a trailing .pdf suffix (any letter case) if present. The result of
  # String#casecmp is an Integer and therefore always truthy, so it must not
  # be used directly as a condition.
  outfilebase = ARGV[0].sub(/\.pdf\z/i, "")

  tet.set_option(globaloptlist)

  doc = tet.open_document(ARGV[0], docoptlist)
  raise tet_error.call if doc == -1

  # Get number of pages in the document
  n_pages = tet.pcos_get_number(doc, "length:pages").to_i

  # Loop over pages and extract images
  1.upto(n_pages) do |page_number|
    imagecount = 0
    page = tet.open_page(doc, page_number, pageoptlist)

    if page == -1
      print tet_error.call + "\n"
      next # try next page
    end

    # Retrieve all images on the page
    while (ti = tet.get_image_info(page))
      imagecount += 1

      # Report image details: pixel geometry, color space etc.
      report_image_info(tet, doc, ti.imageid)

      # Report placement geometry. The bit test needs an explicit comparison
      # because 0 is truthy in Ruby.
      artifact = (ti.attributes & TET::ATTR_ARTIFACT) != 0 ? ", Artifact" : ""
      printf("  placed on page %d at position (%g, %g): %dx%dpt, alpha=%g, beta=%g%s\n",
             page_number, ti.x, ti.y, ti.width.to_i, ti.height.to_i,
             ti.alpha, ti.beta, artifact)

      # Write image data to file
      image_base = "#{outfilebase}_p#{page_number}_#{imagecount}_I#{ti.imageid}"
      if tet.write_image_file(doc, ti.imageid, " filename {#{image_base}}") == -1
        print tet_error.call + "\n"
        next # try next image
      end

      # Check whether the image has a mask attached (-1 means no mask).
      # The id is truncated to an Integer so it can be used in pCOS paths
      # and file names.
      maskid = tet.pcos_get_number(doc, "images[#{ti.imageid}]/maskid").to_i
      next if maskid == -1

      # ... and retrieve it if present
      print "  masked with "
      report_image_info(tet, doc, maskid)

      # Write the mask's own image data (not the masked image again) to file
      mask_optlist = " filename {#{image_base}mask_I#{maskid}}"
      if tet.write_image_file(doc, maskid, mask_optlist) == -1
        print tet_error.call + "\n"
      end
    end

    # The loop ends when get_image_info returns nil/false; a non-zero error
    # number at this point is reported together with the page number.
    if tet.get_errnum != 0
      print "Error #{tet.get_errnum} in #{tet.get_apiname}() " \
            "on page #{page_number}: #{tet.get_errmsg}\n"
    end

    tet.close_page(page)
  end

  tet.close_document(doc)

rescue TETException => pe
  print pe.backtrace.join("\n") + "\n"
  printf("Error %d in %s(): %s\n",
         pe.get_errnum, pe.get_apiname, pe.get_errmsg)
rescue StandardError => e
  # StandardError rather than Exception, so that Interrupt and SystemExit
  # are not swallowed here
  print e.backtrace.join("\n") + "\n" + e.to_s + "\n"
ensure
  # tet is nil when the failure happened before TET.new
  tet.delete if tet
end
