#!/usr/bin/env ruby
# Extract text from PDF and filter according to font name and size.
# This can be used to identify headings in the document and create a
# table of contents.

require 'TET'

# Global option list, passed to tet.set_option
globaloptlist = "searchpath={{../data} {../../../resource/cmap}}"

# Document-specific option list, passed to tet.open_document
docoptlist = ""

# Page-specific option list, passed to tet.open_page
pageoptlist = "granularity=line"

# Report only text with at least this font size (use 0 to catch all sizes)
fontsizetrigger = 10

# Report only text whose font name contains this string
# (use empty string to catch all font names)
fontnametrigger = "Bold"

# Main script body: open the PDF named on the command line, walk all pages,
# and print every text fragment whose first character matches the font size
# and font name triggers. Any error is reported on standard output; the TET
# object is always released in the ensure clause.
begin

    # Exactly one argument is expected: the name of the input PDF.
    if ARGV.length != 1
        raise("usage: fontfilter.rb <infilename>\n")
    end

    tet = TET.new

    # Builds an error line from the error number, API name and message
    # currently reported by the TET object.
    api_error = lambda do
        "Error #{tet.get_errnum} in #{tet.get_apiname}(): #{tet.get_errmsg}"
    end

    tet.set_option(globaloptlist)

    doc = tet.open_document(ARGV[0], docoptlist)

    # A document handle of -1 is treated as a fatal open failure.
    raise api_error.call if doc == -1

    # get number of pages in the document
    n_pages = tet.pcos_get_number(doc, "length:pages")

    # loop over pages in the document
    1.upto(n_pages) do |pageno|
        page = tet.open_page(doc, pageno, pageoptlist)

        if page == -1
            # A page that cannot be opened is reported but does not abort
            # the run; the remaining pages are still processed.
            print api_error.call + "\n"
            next                        # try next page
        end

        # Retrieve all text fragments for the page
        while (text = tet.get_text(page))
            # In this sample we check only the first character of each
            # fragment, so character info is fetched once per fragment.
            ci = tet.get_char_info(page)
            next if ci.nil?

            # We need only the font name and size; the text position
            # could be fetched from ci.x and ci.y.
            next if ci.fontsize < fontsizetrigger

            # Look up the font name only for fragments that passed the
            # size check.
            fontname = tet.pcos_get_string(doc, "fonts[#{ci.fontid}]/name")

            # Plain substring test: the trigger is a literal string, not a
            # regular expression. An empty trigger matches every font name.
            next unless fontname.include?(fontnametrigger)

            # print the retrieved font name, size, and text
            printf("[%s %.2f] %s\n", fontname, ci.fontsize, text)
        end

        # Report an error left behind by text retrieval on this page.
        if tet.get_errnum != 0
            print api_error.call + "\n"
        end

        tet.close_page(page)
    end

    tet.close_document(doc)

rescue TETException => pe
    print pe.backtrace.join("\n") + "\n"
    printf("TET exception occurred in fontfilter:\n")
    print "[#{pe.get_errnum}] #{pe.get_apiname}: #{pe.get_errmsg}\n"
rescue StandardError => e
    # StandardError (not Exception) so that SystemExit and Interrupt are
    # not swallowed; the ensure clause still runs for those.
    print e.backtrace.join("\n") + "\n" + e.to_s + "\n"
ensure
    # tet is nil when the failure happened before TET.new.
    tet.delete if tet
end
