#!/usr/bin/env ruby
#
# Simple PDF text extractor based on PDFlib TET
#

require 'TET'


# global option list
globaloptlist = "searchpath={{../data} {../../../resource/cmap}}"

# document-specific option list
docoptlist = ""

# page-specific option list
pageoptlist = "granularity=page"

# Separator to emit after each chunk of text. The right value depends on the
# application's needs; for granularity=word a space character may be useful.
separator = "\n"

# Number of the page currently being processed (0 while no page is being
# processed). The TETException handler uses it to report where a failure
# happened.
pageno = 0

# Declared up front so that the ensure clause can release them no matter
# how far the script got.
tet = nil
outfp = nil

begin
    if ARGV.length != 2
        raise "usage: extractor.rb <infilename> <outfilename>"
    end

    tet = TET.new

    outfp = File.new(ARGV[1], "w")

    tet.set_option(globaloptlist)

    doc = tet.open_document(ARGV[0], docoptlist)

    if doc == -1
        raise "Error #{tet.get_errnum} in #{tet.get_apiname}(): " \
              "#{tet.get_errmsg}"
    end

    # get number of pages in the document
    n_pages = tet.pcos_get_number(doc, "length:pages")

    # loop over pages in the document
    1.upto(n_pages) do |current_page|
        # Update the outer variable (instead of shadowing it with the block
        # parameter) so the exception handler below knows the current page.
        pageno = current_page

        page = tet.open_page(doc, pageno, pageoptlist)

        if page == -1
            print "Error #{tet.get_errnum} in #{tet.get_apiname}(): " \
                  "#{tet.get_errmsg}\n"
            next                        # try next page
        end

        # Retrieve all text fragments. This is actually not required
        # for granularity=page, but must be used for other granularities.
        until (text = tet.get_text(page)).nil?
            outfp.print text            # print the retrieved text

            # print a separator between chunks of text
            outfp.print separator
        end

        if tet.get_errnum != 0
            print "Error #{tet.get_errnum} in #{tet.get_apiname}() " \
                  "on page #{pageno}: #{tet.get_errmsg}\n"
        end

        tet.close_page(page)
    end

    # No page is being processed any more; do not attribute later errors
    # to the last page.
    pageno = 0

    tet.close_document(doc)

rescue TETException => pe
    print pe.backtrace.join("\n") + "\n"
    print "Error [#{pe.get_errnum}] #{pe.get_apiname}: #{pe.get_errmsg}"
    print " on page #{pageno}" if pageno != 0
    print "\n"
rescue StandardError => e
    # StandardError rather than Exception, so that Interrupt, SystemExit
    # and similar are not swallowed.
    print e.backtrace.join("\n") + "\n" + e.to_s + "\n"
ensure
    # Close the output file so buffered text is flushed even on errors.
    outfp.close if outfp
    tet.delete if tet
end
