/**
 * Extract text from a PDF document as TETML. If an output filename is specified,
 * write the TETML to the output file. Otherwise fetch the TETML in memory, parse
 * it, and print some information to System.out.
 */

import java.io.ByteArrayInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import com.pdflib.TET;
import com.pdflib.TETException;

/**
 * Command-line sample that converts a PDF document to TETML using TET.
 *
 * With two arguments the TETML is written to the named output file. With a
 * single argument the TETML is fetched into memory and parsed with SAX in
 * order to list the fonts and count the words it contains.
 */
public class tetml
{
    /**
     * Global option list.
     */
    static final String globaloptlist = "searchpath={{../data} " +
                        "{../../../resource/cmap}}";

    /**
     * Document specific option list. Appended to the tetml= option that
     * selects file or in-memory output.
     */
    static final String basedocoptlist = "";

    /**
     * Page-specific option list.
     * Remove the tetml= option if you don't need font and geometry
     * information.
     */
    static final String pageoptlist =
            "granularity=word tetml={glyphdetails={all}}";

    /**
     * Word counter for in-memory processing code. Incremented by the
     * sax_handler instance while the TETML is being parsed.
     */
    int word_count = 0;

    /**
     * SAX handler class to count the words in the document.
     *
     * This is deliberately a (non-static) inner class: it updates the
     * word_count field of the enclosing tetml instance.
     */
    private class sax_handler extends DefaultHandler
    {
        /**
         * Counts each "Word" element and prints the "name" and "type"
         * attributes of each "Font" element to System.out.
         *
         * Matching is done on the local name, so the parser must be
         * namespace-aware for this handler to see the expected names.
         */
        @Override
        public void startElement (String uri, String local_name,
            String qualified_name, Attributes attributes) throws SAXException
        {
            if (local_name.equals("Word"))
            {
                word_count += 1;
            }
            else if (local_name.equals("Font"))
            {
                System.out.println("Font " + attributes.getValue("", "name")
                        + " (" + attributes.getValue("", "type") + ")");
            }
        }
    }

    /**
     * Program entry point.
     *
     * @param args args[0] is the PDF input file name; the optional args[1]
     *             is the TETML output file name. Without args[1] the TETML
     *             is processed in memory.
     */
    public static void main(String[] args)
    {
        if (args.length < 1 || args.length > 2)
        {
            System.err.println("usage: tetml <pdffilename> [ <tetmlfilename> ]");
            return;
        }

        final boolean inmemory = args.length == 1;

        /*
         * We need a tetml object, otherwise it's not possible to set up the
         * handler for the SAX parser with the local sax_handler class.
         */
        tetml t = new tetml();
        t.process_xml(args, inmemory);
    }

    /**
     * Generates TETML for the document named in args[0] and either lets TET
     * write it to args[1] or parses it in memory.
     *
     * All errors are reported on System.err; nothing is thrown to the
     * caller. The TET object is released exactly once, in the finally
     * block, no matter how the method is left.
     *
     * @param args     command-line arguments as validated by main
     * @param inmemory true to fetch and parse the TETML in memory,
     *                 false to write it to the file args[1]
     */
    private void process_xml(String[] args, final boolean inmemory)
    {
        TET tet = null;
        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            /*
             * An empty tetml= suboption list requests in-memory output;
             * otherwise the output file name is passed to TET.
             */
            final String docoptlist =
                (inmemory ? "tetml={}" : "tetml={filename={" + args[1] + "}}")
                    + " " + basedocoptlist;

            if (inmemory)
            {
                System.out.println("Processing TETML output for document \""
                        + args[0] + "\" in memory...");
            }
            else
            {
                System.out.println("Extracting TETML for document \""
                        + args[0] + "\" to file \"" + args[1] + "\"...");
            }

            final int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                System.err.println("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "(): " + tet.get_errmsg());
                /*
                 * No tet.delete() here: the finally block below releases
                 * the TET object, and deleting it twice must be avoided.
                 */
                return;
            }

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /*
             * Loop over pages in the document; page numbers are 1-based.
             */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                tet.process_page(doc, pageno, pageoptlist);
            }

            /*
             * This could be combined with the last page-related call.
             */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (inmemory)
            {
                /*
                 * Get the TETML document as a byte array.
                 */
                final byte[] tetml = tet.get_tetml(doc, "");

                if (tetml == null)
                {
                    /*
                     * Report the problem but fall through, so that the
                     * document is still closed below.
                     */
                    System.err.println("tetml: couldn't retrieve TETML data");
                }
                else
                {
                    /*
                     * Process the in-memory TETML document to print out some
                     * information that is extracted with the sax_handler
                     * class. The factory must be namespace-aware because the
                     * handler compares local element names.
                     */
                    SAXParserFactory spf = SAXParserFactory.newInstance();
                    spf.setNamespaceAware(true);
                    SAXParser saxParser = spf.newSAXParser();
                    XMLReader reader = saxParser.getXMLReader();
                    reader.setContentHandler(new sax_handler());
                    reader.parse(
                        new InputSource(new ByteArrayInputStream(tetml)));
                    System.out.println("Found " + word_count
                            + " words in document");
                }
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            System.err.println("Error " + e.get_errnum() + " in "
                    + e.get_apiname() + "(): " + e.get_errmsg());
        }
        catch (Exception e)
        {
            /* Parser configuration, SAX and I/O errors end up here. */
            System.err.println(e);
        }
        finally
        {
            /* Single place where the TET object is released. */
            if (tet != null)
            {
                tet.delete();
            }
        }
    }
}
