#!/usr/bin/perl
#
# Page-based image extractor based on PDFlib TET
#

use PDFlib::TET;
use strict;

# Global option list
my $globaloptlist = "searchpath={{../data}}";

# Document-specific option list
my $docoptlist = "";

# Page-specific option list, e.g.
# "imageanalysis={merge={gap=1}}"
my $pageoptlist = "";

# Number of the page currently being processed. It lives at file scope and
# is driven by a C-style loop (not a foreach loop variable, which would be
# restored when the loop is left) so that the exception handler below can
# still see on which page a failure happened. 0 means that no page has
# been opened yet.
my $pageno = 0;

# Everything that can throw runs inside this eval; failures are reported by
# the handler that follows it.
eval {
    if (@ARGV != 1) {
        die("usage: images_per_page.pl <infilename>\n");
    }

    my $tet = PDFlib::TET->new();

    # Build "Error <num> in <api>()<context>: <message>\n" from the error
    # state currently stored in the TET object. $context is an optional
    # suffix such as " on page 3".
    my $tet_error = sub {
        my ($context) = @_;
        return "Error " . $tet->get_errnum() . " in " . $tet->get_apiname()
            . "()" . $context . ": " . $tet->get_errmsg() . "\n";
    };

    # Output files are named after the input file without its ".pdf"
    # suffix (matched case-insensitively).
    my $outfilebase = $ARGV[0];
    if (length($outfilebase) > 4 && lc(substr($outfilebase, -4)) eq ".pdf") {
        $outfilebase = substr($outfilebase, 0, -4);
    }

    $tet->set_option($globaloptlist);

    my $doc = $tet->open_document($ARGV[0], $docoptlist);
    if ($doc == -1) {
        die($tet_error->(""));
    }

    # Get number of pages in the document
    my $n_pages = $tet->pcos_get_number($doc, "length:pages");

    # Loop over pages and extract images
    for ($pageno = 1; $pageno <= $n_pages; ++$pageno) {
        my $imagecount = 0;

        my $page = $tet->open_page($doc, $pageno, $pageoptlist);
        if ($page == -1) {
            print($tet_error->(""));
            next;                        # process next page
        }

        # Retrieve all images on the page
        while (my $ti = $tet->get_image_info($page)) {
            $imagecount++;
            my $imageid = $ti->{imageid};

            # Report image details: pixel geometry, color space, etc.
            report_image_info($tet, $doc, $imageid);

            # Report placement geometry
            my $artifact =
                ($ti->{"attributes"} & PDFlib::TET::ATTR_ARTIFACT) != 0
                ? ", Artifact"
                : "";
            printf("  placed on page %d at position (%g, %g): " .
                "%dx%dpt, alpha=%g, beta=%g%s\n",
                $pageno, $ti->{x}, $ti->{y},
                $ti->{width}, $ti->{height}, $ti->{alpha}, $ti->{beta},
                $artifact);

            # Write image data to file; the name encodes the page number,
            # the running image count on that page and the pCOS image id.
            my $imagefilebase = $outfilebase . "_p" . $pageno . "_"
                . $imagecount . "_I" . $imageid;
            my $imageoptlist = " filename {" . $imagefilebase . "}";
            if ($tet->write_image_file($doc, $imageid, $imageoptlist) == -1) {
                print($tet_error->(""));
                next;     # process next image (its mask is skipped as well)
            }

            # Check whether the image has a mask attached
            my $maskid = $tet->pcos_get_number($doc,
                "images[" . $imageid . "]/maskid");
            if ($maskid != -1) {
                print "  masked with ";

                report_image_info($tet, $doc, $maskid);

                my $maskoptlist = " filename {" . $imagefilebase
                    . "_mask_I" . $maskid . "}";
                if ($tet->write_image_file($doc, $maskid, $maskoptlist) == -1) {
                    print($tet_error->(" for mask image"));
                }
            }
        }

        # A non-zero error number after the image loop is reported
        # together with the page number.
        if ($tet->get_errnum() != 0) {
            print($tet_error->(" on page $pageno"));
        }

        $tet->close_page($page);
    }

    $tet->close_document($doc);
};

if ($@) {
    # Use print rather than printf: the exception text is data, not a
    # format string, and may itself contain '%' characters.
    print("TET Exception occurred:\n");
    if ($pageno == 0) {
        print("Error $@\n");
    } else {
        print("Error $@ on page $pageno\n");
    }
    exit(1);
}

# Print a one-line description of an image to the currently selected
# output handle:
# - pCOS id (required for indexing the images[] array)
# - pixel size of the underlying PDF Image XObject
# - number of components, bits per component, and colorspace
#   (plus the base colorspace name for Indexed colorspaces)
# - mergetype if different from "normal", i.e. "artificial" (=merged)
#   or "consumed"
# - "stencilmask" property, i.e. /ImageMask in PDF
#
# Parameters:
#   $tet      object providing pcos_get_number() and pcos_get_string()
#   $doc      document handle, passed through to the pCOS calls
#   $imageid  index of the image in the pCOS images[] array
#
# The return value is not meaningful; the sub is called for its output.

sub report_image_info {
    my ($tet, $doc, $imageid) = @_;

    my $image  = "images[$imageid]";
    my $width  = $tet->pcos_get_number($doc, "$image/Width");
    my $height = $tet->pcos_get_number($doc, "$image/Height");
    my $bpc    = $tet->pcos_get_number($doc, "$image/bpc");

    # The image only stores the id of its colorspace; component count and
    # name have to be looked up in the colorspaces[] array.
    my $cs         = $tet->pcos_get_number($doc, "$image/colorspaceid");
    my $components = $tet->pcos_get_number($doc, "colorspaces[$cs]/components");

    printf("image I%d: %dx%d pixel, ", $imageid, $width, $height);
    my $csname = $tet->pcos_get_string($doc, "colorspaces[$cs]/name");

    printf("%gx%g bit %s", $components, $bpc, $csname);

    # For Indexed colorspaces also name the underlying base colorspace
    if ($csname eq "Indexed") {
        my $basecs = $tet->pcos_get_number($doc, "colorspaces[$cs]/baseid");
        my $basecsname = $tet->pcos_get_string($doc, "colorspaces[$basecs]/name");
        printf(" %s", $basecsname);
    }

    # Check whether this image has been created by merging smaller images
    # (1 = artificial) or has been merged into a larger one (2 = consumed);
    # any other value is treated as "normal" and not reported.
    my $mergetype = $tet->pcos_get_number($doc, "$image/mergetype");
    if ($mergetype == 1) {
        print(", mergetype=artificial");
    }
    elsif ($mergetype == 2) {
        print(", mergetype=consumed");
    }

    my $stencilmask = $tet->pcos_get_number($doc, "$image/stencilmask");
    if ($stencilmask == 1) {
        print(", used as stencil mask");
    }

    print("\n");
}

