TET Cookbook

cookbook

tetml/tetml

Extract text from PDF document as XML.

Download Java Code  Show Output  Show Input (TET-datasheet.pdf) 

/*
 * Extract text from PDF document as TETML. If constant INMEMORY is false, write
 * the TETML to the output file. Otherwise fetch the TETML in memory, parse it and
 * print some information to System.out.
 * 
 * Required software: TET 5
 * 
 * Required data: PDF document
 * 
 */

package com.pdflib.cookbook.tet.tetml;

import java.io.ByteArrayInputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import com.pdflib.TET;
import com.pdflib.TETException;

public class tetml {
    /*
     * Global option list.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /*
     * Document specific option list.
     */
    static final String BASE_DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     */
    static final String PAGE_OPTLIST = "granularity=word";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /*
     * Set to true for in-memory processing.
     */
    private static final boolean INMEMORY = true;

    /*
     * Word counter for in-memory processing code.
     */
    int word_count = 0;

    /*
     * SAX handler class to count the words in the document.
     */
    private class sax_handler extends DefaultHandler {
        public void startElement(String uri, String local_name, String qualified_name, Attributes attributes)
                throws SAXException {
            if (local_name.equals("Word")) {
                word_count += 1;
            } else if (local_name.equals("Font")) {
                out.println("Font " + attributes.getValue("", "name") + " (" + attributes.getValue("", "type") + ")");
            }
        }
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            System.err.println("usage: tetml <pdffilename>");
            return;
        }

        /*
         * We need a tetml object; otherwise it's not possible to set up the handler for
         * the SAX parser with the local sax_handler class.
         */
        tetml t = new tetml();
        t.process_xml(args);
    }

    private void process_xml(String[] args) {
        TET tet = null;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final String outputfilename = args[0] + ".tetml";

            final String docoptlist = (INMEMORY ? "tetml={}" : "tetml={filename={" + outputfilename + "}}") + " "
                    + BASE_DOC_OPTLIST;

            if (INMEMORY) {
                out.println("Processing TETML output for document \"" + args[0] + "\" in memory...");
            } else {
                out.println("Extracting TETML for document \"" + args[0] + "\" to file \"" + outputfilename + "\"...");
            }

            final int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                tet.delete();
                return;
            }

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /*
             * Loop over pages in the document;
             */
            for (int pageno = 0; pageno <= n_pages; ++pageno) {
                tet.process_page(doc, pageno, PAGE_OPTLIST);
            }

            /*
             * This could be combined with the last page-related call.
             */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (INMEMORY) {
                /*
                 * Get the TETML document as a byte array.
                 */
                final byte[] tetml = tet.get_tetml(doc, "");

                if (tetml == null) {
                    System.err.println("tetml: couldn't retrieve TETML data");
                    return;
                }

                /*
                 * Process the in-memory TETML document to print out some information that is
                 * extracted with the sax_handler class.
                 */

                SAXParserFactory spf = SAXParserFactory.newInstance();
                spf.setNamespaceAware(true);
                SAXParser saxParser = spf.newSAXParser();
                XMLReader reader = saxParser.getXMLReader();
                reader.setContentHandler(new sax_handler());
                reader.parse(new InputSource(new ByteArrayInputStream(tetml)));
                System.out.println("Found " + word_count + " words in document");
            }

            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg());
            System.exit(1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }
    }
}