#!/usr/bin/perl
#
# Simple PDF text extractor based on PDFlib TET
#

use PDFlib::TET;
use strict;


# Global option list, passed to set_option() before the document is opened.
my $globaloptlist = "searchpath={{../data} {../../../resource/cmap}}";

# Document-specific option list, passed to open_document().
my $docoptlist = "";

# Page-specific option list, passed to open_page().
my $pageoptlist = "granularity=page";

# Separator to emit after each chunk of text. This depends on the
# application's needs; for granularity=word a space character may be useful.
my $separator = "\n";


# Number of the page currently being processed. It lives outside the eval
# so that the exception handler at the bottom can report it; 0 means that
# no page is being processed.
my $pageno = 0;

# Main program: extract the text of every page of <infilename> and write it
# to <outfilename>. Any die() raised inside the eval (usage error, output
# file failure, or an exception thrown by TET) is reported by the handler
# that follows the eval.
eval {
    if ($#ARGV != 1) {
        die("usage: extractor.pl <infilename> <outfilename>\n");
    }

    my $tet = PDFlib::TET->new();

    # Three-argument open with a lexical handle: the file name is taken
    # literally and can never be interpreted as part of the open mode.
    open(my $outfp, '>', $ARGV[1])
        or die("Couldn't open output file '" . $ARGV[1] . "': $!\n");
    binmode($outfp, ":utf8");

    $tet->set_option($globaloptlist);

    my $doc = $tet->open_document($ARGV[0], $docoptlist);

    # open_document() signals failure by returning -1.
    if ($doc == -1) {
        die("Error " . $tet->get_errnum() . " in " . $tet->get_apiname()
            . "(): " . $tet->get_errmsg() . "\n");
    }

    # Get the number of pages in the document.
    my $n_pages = $tet->pcos_get_number($doc, "length:pages");

    # Loop over all pages in the document.
    for ($pageno = 1; $pageno <= $n_pages; ++$pageno) {
        my $page = $tet->open_page($doc, $pageno, $pageoptlist);

        # A page which cannot be opened is reported and skipped; the
        # remaining pages are still processed.
        if ($page == -1) {
            print("Error " . $tet->get_errnum() . " in " . $tet->get_apiname()
                . "(): " . $tet->get_errmsg() . "\n");
            next;
        }

        # Retrieve all text fragments. This is actually not required
        # for granularity=page, but must be used for other granularities.
        while (defined(my $text = $tet->get_text($page))) {
            print {$outfp} $text;

            # Emit the separator after each chunk of text.
            print {$outfp} $separator;
        }

        # get_text() returning undef ends the loop above; a non-zero
        # error number is reported here as an error on this page.
        if ($tet->get_errnum() != 0) {
            print("Error " . $tet->get_errnum() . " in "
                . $tet->get_apiname() . "() on page $pageno: "
                . $tet->get_errmsg() . "\n");
        }

        $tet->close_page($page);
    }

    # All pages are done: make sure that a failure below is not attributed
    # to a page number past the end of the document.
    $pageno = 0;

    $tet->close_document($doc);

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($outfp)
        or die("Couldn't close output file '" . $ARGV[1] . "': $!\n");
};

if ($@) {
    # Use print instead of printf: the error text is data, not a format
    # string, and may legitimately contain '%' characters.
    print("TET Exception occurred:\n");
    if ($pageno == 0) {
        print("Error $@\n");
    }
    else {
        print("Error $@ on page $pageno\n");
    }
    exit(1);
}
