tetml/tetml
Extract text from PDF document as XML.
Download Java Code Show Output Show Input (TET-datasheet.pdf)
/*
* Extract text from PDF document as TETML. If constant INMEMORY is false, write
* the TETML to the output file. Otherwise fetch the TETML in memory, parse it and
* print some information to System.out.
*
* Required software: TET 5
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.tetml;
import java.io.ByteArrayInputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import com.pdflib.TET;
import com.pdflib.TETException;
/*
 * Cookbook sample: run TET over a PDF document and produce TETML. Depending on
 * the INMEMORY constant the TETML is either written to "<input>.tetml" or
 * retrieved as a byte array and fed to a SAX parser that prints the fonts and
 * counts the words found in the document.
 */
public class tetml {
    /*
     * Global option list.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /*
     * Document specific option list.
     */
    static final String BASE_DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     */
    static final String PAGE_OPTLIST = "granularity=word";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Initialized in main() before any processing starts.
     */
    private static PrintStream out;

    /*
     * Set to true for in-memory processing.
     */
    private static final boolean INMEMORY = true;

    /*
     * Word counter for in-memory processing code. Incremented by sax_handler
     * for every "Word" element encountered while parsing the TETML.
     */
    int word_count = 0;

    /*
     * SAX handler class to count the words in the document. It is an inner
     * (non-static) class on purpose: it updates word_count of the enclosing
     * tetml instance.
     */
    private class sax_handler extends DefaultHandler {
        /*
         * Called by the SAX parser for each start tag. "Word" elements bump the
         * word counter; for "Font" elements the "name" and "type" attributes
         * are printed to the output stream.
         */
        @Override
        public void startElement(String uri, String local_name, String qualified_name, Attributes attributes)
                throws SAXException {
            if (local_name.equals("Word")) {
                word_count += 1;
            } else if (local_name.equals("Font")) {
                out.println("Font " + attributes.getValue("", "name") + " (" + attributes.getValue("", "type") + ")");
            }
        }
    }

    /*
     * Entry point. Expects exactly one argument, the name of the PDF file to
     * process; prints a usage message to System.err otherwise.
     *
     * Throws UnsupportedEncodingException if OUTPUT_ENCODING is not a charset
     * name supported by PrintStream.
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            System.err.println("usage: tetml <pdffilename>");
            return;
        }

        /*
         * We need a tetml object; otherwise it's not possible to set up the
         * handler for the SAX parser with the local sax_handler class.
         */
        tetml t = new tetml();
        t.process_xml(args);
    }

    /*
     * Open the document named by args[0], generate TETML for all pages plus the
     * trailer, and either leave the result in the output file or parse it in
     * memory and report fonts and word count.
     *
     * Errors are reported on System.err; exceptions terminate the JVM with
     * exit status 1.
     */
    private void process_xml(String[] args) {
        TET tet = null;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final String outputfilename = args[0] + ".tetml";

            /*
             * An empty "tetml" suboption list requests in-memory TETML; with a
             * filename the TETML is written to that file.
             */
            final String docoptlist = (INMEMORY ? "tetml={}" : "tetml={filename={" + outputfilename + "}}") + " "
                    + BASE_DOC_OPTLIST;

            if (INMEMORY) {
                out.println("Processing TETML output for document \"" + args[0] + "\" in memory...");
            } else {
                out.println("Extracting TETML for document \"" + args[0] + "\" to file \"" + outputfilename + "\"...");
            }

            final int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                /*
                 * No explicit tet.delete() here: the finally block below runs
                 * on this return path and deletes the object exactly once.
                 */
                return;
            }

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /*
             * Loop over pages in the document. Page numbers are 1-based; page
             * number 0 is only used for the trailer call below.
             */
            for (int pageno = 1; pageno <= n_pages; ++pageno) {
                tet.process_page(doc, pageno, PAGE_OPTLIST);
            }

            /*
             * This could be combined with the last page-related call.
             */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (INMEMORY) {
                /*
                 * Get the TETML document as a byte array.
                 */
                final byte[] tetml = tet.get_tetml(doc, "");
                if (tetml == null) {
                    System.err.println("tetml: couldn't retrieve TETML data");
                    return;
                }

                /*
                 * Process the in-memory TETML document to print out some
                 * information that is extracted with the sax_handler class.
                 * Namespace awareness is required so that the handler receives
                 * non-empty local names.
                 */
                SAXParserFactory spf = SAXParserFactory.newInstance();
                spf.setNamespaceAware(true);
                SAXParser saxParser = spf.newSAXParser();
                XMLReader reader = saxParser.getXMLReader();
                reader.setContentHandler(new sax_handler());
                reader.parse(new InputSource(new ByteArrayInputStream(tetml)));

                /*
                 * Use the encoding-aware stream like all other informational
                 * output of this sample.
                 */
                out.println("Found " + word_count + " words in document");
            }

            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg());
            System.exit(1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }
    }
}