#!/usr/bin/perl
#
# Resource-based image extractor based on PDFlib TET
#

use PDFlib::TET;
use strict;


# Global option list, handed to set_option() before the document is opened.
my $globaloptlist = "searchpath={{../data}}";

# Document-specific option list, handed to open_document().
my $docoptlist = "";

# Page-specific option list, handed to open_page(), e.g.
# "imageanalysis={merge={gap=1}}"
my $pageoptlist = "";


# Main program.  Everything runs inside eval so that any die (from this
# script or raised through the TET binding) ends up in the handler below.
eval {
    # Exactly one argument is expected: the PDF file to process.
    if (@ARGV != 1) {
        die("usage: image_resources.pl <filename>\n");
    }

    my $tet = PDFlib::TET->new;

    # Output files are named after the input file without its ".pdf"
    # suffix (compared case-insensitively).  A name that consists of the
    # suffix alone is left untouched.
    my $outfilebase = $ARGV[0];
    if (length($outfilebase) > 4 && lc(substr($outfilebase, -4)) eq ".pdf") {
        $outfilebase = substr($outfilebase, 0, -4);
    }

    $tet->set_option($globaloptlist);

    my $doc = $tet->open_document($ARGV[0], $docoptlist);

    if ($doc == -1) {
        die("Error " . $tet->get_errnum() . " in " . $tet->get_apiname()
            . "(): " . $tet->get_errmsg() . "\n");
    }

    # Get number of pages in the document
    my $n_pages = $tet->pcos_get_number($doc, "length:pages");

    # Loop over all pages to trigger image merging.  Nothing is read from
    # the pages here; each one is only opened and closed again.
    for my $pageno (1 .. $n_pages) {
        my $page = $tet->open_page($doc, $pageno, $pageoptlist);

        if ($page == -1) {
            print("Error " . $tet->get_errnum() . " in " . $tet->get_apiname()
                . "(): " . $tet->get_errmsg() . "\n");
            next;                        # process next page
        }

        # The page opened, but report any error number TET still has set.
        if ($tet->get_errnum() != 0) {
            print("Error " . $tet->get_errnum() . " in "
                . $tet->get_apiname() . "() on page $pageno: "
                . $tet->get_errmsg() . "\n");
        }

        $tet->close_page($page);
    }

    # Get the number of images in the document.  This is queried only
    # after the page loop above has run.
    my $n_images = $tet->pcos_get_number($doc, "length:images");

    # Loop over all image resources (pCOS ids start at 0)
    for my $imageid (0 .. $n_images - 1) {
        # Skip images which have been consumed by merging
        my $mergetype = $tet->pcos_get_number($doc,
                            "images[$imageid]/mergetype");
        next if $mergetype == 2;

        # Skip images which have been flagged by the "small image" filter
        next if $tet->pcos_get_number($doc, "images[$imageid]/small");

        # Report image details: pixel geometry, color space, etc.
        report_image_info($tet, $doc, $imageid);

        # Write image data to file; only the base name is supplied here.
        my $imageoptlist = sprintf("filename={%s_I%d}", $outfilebase, $imageid);
        if ($tet->write_image_file($doc, $imageid, $imageoptlist) == -1) {
            print("Error " . $tet->get_errnum() . " in "
                . $tet->get_apiname() . "(): "
                . $tet->get_errmsg() . "\n");
        }
    }

    $tet->close_document($doc);
};

# Report whatever was thrown inside the eval block and exit with status 1.
# print (not printf) is used on purpose: the message may contain '%'
# characters which must not be interpreted as format directives.
if ($@) {
    print("TET Exception occurred:\n");
    print("Error $@\n");
    exit(1);
}

# Print the following information for each image:
# - pCOS id (required for indexing the images[] array)
# - pixel size of the underlying PDF Image XObject
# - number of components, bits per component, and colorspace
# - mergetype "artificial" (= created by merging smaller images); images
#   with mergetype "consumed" are skipped by the caller and never get here
# - "stencilmask" property, i.e. /ImageMask in PDF
# - pCOS id of the mask image, if any (i.e. /Mask or /SMask in PDF)

sub report_image_info {
    # Arguments: TET object, document handle, pCOS id of the image.
    my ($tet, $doc, $imageid) = @_;

    # Small accessors so that each pCOS query below is just a path.
    my $number_at = sub { return $tet->pcos_get_number($doc, $_[0]) };
    my $string_at = sub { return $tet->pcos_get_string($doc, $_[0]) };

    my $image = "images[$imageid]";

    # Pixel geometry and color space data of the image.
    my $pixel_width  = $number_at->("$image/Width");
    my $pixel_height = $number_at->("$image/Height");
    my $bits         = $number_at->("$image/bpc");
    my $csid         = $number_at->("$image/colorspaceid");
    my $n_components = $number_at->("colorspaces[$csid]/components");

    printf("image I%d: %dx%d pixel, ", $imageid, $pixel_width, $pixel_height);

    my $cs_label = $string_at->("colorspaces[$csid]/name");
    printf("%gx%g bit %s", $n_components, $bits, $cs_label);

    # For an Indexed color space additionally name its base color space.
    if ($cs_label eq "Indexed") {
        my $base_id    = $number_at->("colorspaces[$csid]/baseid");
        my $base_label = $string_at->("colorspaces[$base_id]/name");
        printf(" %s", $base_label);
    }

    # Mergetype 1: the image has been created by merging smaller images.
    my $merge_kind = $number_at->("$image/mergetype");
    print(", mergetype=artificial") if $merge_kind == 1;

    # The image itself is a stencil mask.
    my $is_stencil = $number_at->("$image/stencilmask");
    print(", used as stencil mask") if $is_stencil == 1;

    # A maskid of -1 means that no mask image is attached.
    my $mask = $number_at->("$image/maskid");
    printf(", masked with image %d", $mask) unless $mask == -1;

    printf("\n");
}

