#!/usr/bin/perl
# Extract text from PDF and filter according to font name and size.
# This can be used to identify headings in the document and create a
# table of contents.

use PDFlib::TET;
use strict;

# Global option list, passed to set_option() right after the TET object
# is created.
my $globaloptlist = "searchpath={{../data} {../../../resource/cmap}}";

# Document-specific option list, passed to open_document().
my $docoptlist = "";

# Page-specific option list, passed to open_page().
# NOTE(review): "granularity=line" presumably makes get_text() return one
# line of text per call -- confirm against the TET option reference.
my $pageoptlist = "granularity=line";

# Search text with at least this size (use 0 to catch all sizes).
# Compared with ">=" against the font size of the first character of each
# text fragment.
my $fontsizetrigger = 10;

# Catch text where the font name contains this string
# (use empty string to catch all font names).
my $fontnametrigger = "Bold";

# Number of the page currently being processed. Declared at file scope so
# that the exception handler after the eval block can include it in its
# message; 0 means that no page has been reached yet.
my $pageno = 0;

# Validate the command line before entering the eval block: a usage error is
# not a TET exception and must not be reported as one by the handler below.
if (@ARGV != 1) {
    print STDERR "usage: fontfilter.pl <infilename>\n";
    exit(1);
}

# Open the document named on the command line, walk over all pages and print
# every text fragment whose first character uses a font that matches the
# name and size triggers. Anything that dies in here is reported by the
# exception handler that follows this block.
eval {
    # Plain class-method call; the indirect-object form "new CLASS" is
    # ambiguous to the parser and discouraged.
    my $tet = PDFlib::TET->new;

    $tet->set_option($globaloptlist);

    my $doc = $tet->open_document($ARGV[0], $docoptlist);

    if ($doc == -1) {
        die(sprintf("Error %d in %s(): %s\n",
            $tet->get_errnum(), $tet->get_apiname(), $tet->get_errmsg()));
    }

    # Get the number of pages in the document.
    my $n_pages = $tet->pcos_get_number($doc, "length:pages");

    # Loop over the pages in the document. The file-level $pageno is used as
    # the loop counter on purpose, so that the exception handler can tell
    # which page was being processed.
    for ($pageno = 1; $pageno <= $n_pages; ++$pageno) {
        my $page = $tet->open_page($doc, $pageno, $pageoptlist);

        if ($page == -1) {
            printf("Error %d in %s() on page %d: %s\n",
                $tet->get_errnum(), $tet->get_apiname(), $pageno,
                $tet->get_errmsg());
            next;                        # try next page
        }

        # Retrieve all text fragments for the page.
        while (defined(my $text = $tet->get_text($page))) {
            # In this sample only the first character of each fragment is
            # checked, so a single get_char_info() call is sufficient.
            my $ci = $tet->get_char_info($page);
            next unless $ci;

            # We need only the font name and size; the text position could
            # be fetched from $ci->{"x"} and $ci->{"y"}.
            my $fontname = $tet->pcos_get_string($doc,
                        "fonts[" . $ci->{"fontid"} . "]/name");

            # Check whether we found a match. The name trigger is a literal
            # substring, not a pattern: index() keeps characters such as
            # "+" or "." from being read as regex metacharacters and makes
            # an empty trigger match every font name.
            if ($ci->{"fontsize"} >= $fontsizetrigger &&
                    index($fontname, $fontnametrigger) >= 0) {
                # Print the retrieved font name, size, and text.
                printf("[%s %.2f] %s\n",
                    $fontname, $ci->{"fontsize"}, $text);
            }
        }

        # get_text() returning undef can mean "no more text" or "error";
        # the error number distinguishes the two cases.
        if ($tet->get_errnum() != 0) {
            printf("Error %d in %s() on page %d: %s\n",
                $tet->get_errnum(), $tet->get_apiname(), $pageno,
                $tet->get_errmsg());
        }

        $tet->close_page($page);
    }

    $tet->close_document($doc);
};

# Report an exception raised inside the eval block above and terminate with
# a non-zero exit status. $pageno is 0 as long as the page loop has not been
# entered, in which case no page number is mentioned.
if ($@) {
    # Work on a copy without the trailing newline (the die() messages in
    # this script end in "\n") so that " on page N" stays on the same line
    # and no empty line is printed.
    my $message = $@;
    chomp($message);

    # Use print rather than printf: the exception text is data, not a
    # format string, and may legitimately contain "%" characters.
    print("TET Exception occurred:\n");
    if ($pageno == 0) {
        print("Error $message\n");
    } else {
        print("Error $message on page $pageno\n");
    }
    exit(1);
}
