// tet_and_pdflib/burst
// Split a document into smaller parts based on some page contents.
// Download Java Code Show Output Show Output Show Output Show Input (invoices.pdf)
/*
* Split a document into smaller parts based on some page contents. Various
* criteria for the split points could be useful. The splitting could for
* example be done
*
* - after each empty page
* - when certain text appears on the page (e.g. "Address"). The text could be
* visible on the page, or it could serve as a hidden marker (e.g. invisible
* text or text outside the CropBox)
*
* The example below uses the latter approach. The input document "invoices.pdf"
* contains a sequence of invoices. Each invoice has one or more pages. The
* first page contains the recipient's address and the fixed text "INVOICE" at
* known coordinates. Subsequent pages of the same invoice are blank in these
* places.
*
* The goal is to split the input document into multiple output documents based
* on the recipient's country. A real-world benefit of this could be that the
* postage is cheaper if letters are delivered sorted by country. In the same
* spirit, the invoices could be sorted according to ZIP code, name of the
* addressee, etc.
*
* Required software: TET 5 and PDFlib+PDI 9
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.tet_and_pdflib;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;
/*
 * Splits an input PDF into several output PDFs. TET reads the text in two
 * fixed rectangles of each page: one tells whether the page starts a new
 * sequence, the other supplies the routing criterion that selects the output
 * document. PDFlib+PDI copies each page into the selected output document.
 */
class burst {
    /*
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /*
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist "
            + DOC_SEARCH_PATH + "}";

    /*
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=page";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * x-position of the lower left corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_LLX = 50;

    /*
     * y-position of the lower left corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_LLY = 535;

    /*
     * x-position of the upper right corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_URX = 105;

    /*
     * y-position of the upper right corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_URY = 550;

    /*
     * Text that must be found in the rectangle defined by START_SEQ_TXT_LLX,
     * START_SEQ_TXT_LLY, START_SEQ_TXT_URX, START_SEQ_TXT_URY in order to identify
     * a page as the start of a sequence.
     */
    private static final String START_SEQ_TXT = "INVOICE";

    /*
     * x-position of the lower left corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_LLX = 50;

    /*
     * y-position of the lower left corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_LLY = 612;

    /*
     * x-position of the upper right corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_URX = 175;

    /*
     * y-position of the upper right corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_URY = 624;

    /*
     * Criterion that is substituted when no usable text is found in the criterion
     * rectangle. Such pages are collected in a dedicated output document instead
     * of aborting the run with a NullPointerException.
     */
    private static final String UNKNOWN_CRITERION = "unknown";

    /*
     * Regular expression for the characters that are replaced by "_" when a
     * criterion is turned into a part of a file name. Only letters, digits, ".",
     * "_" and "-" are kept; in particular blanks and path separators are replaced,
     * because the criterion is text taken from the input document and must not be
     * able to redirect the output to another directory.
     */
    private static final String UNSAFE_FILENAME_CHARS = "[^\\p{L}\\p{N}._-]";

    /*
     * Suffix that is stripped (ignoring case) from the output file name.
     */
    private static final String PDF_SUFFIX = ".pdf";

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /*
     * The name of the input file
     */
    private final String infilename;

    /*
     * The name of the output file without a trailing ".pdf"
     */
    private final String outfileBasename;

    /*
     * For mapping criteria to output files. The key is the normalized criterion
     * as produced by toFileNamePart(), i.e. exactly the variable part of the
     * output file name; the value is an object that describes the output
     * document. Using the file name part as key guarantees that no two entries
     * write to the same file.
     */
    private final Map<String, output_document> outputDocuments = new HashMap<String, output_document>();

    /*
     * The output document that received the most recent page. Pages that do not
     * start a new sequence are routed to it.
     */
    private output_document currentOutputDocument = null;

    /*
     * Description of an output document. Declared static because it does not
     * need access to the enclosing burst instance.
     */
    private static final class output_document {
        final pdflib p;
        final int pdiHandle;
        final String filename;

        output_document(pdflib p, int pdiHandle, String filename) {
            this.p = p;
            this.pdiHandle = pdiHandle;
            this.filename = filename;
        }
    }

    /*
     * Import the current page from the PDI import document and place it in the
     * output document.
     *
     * @param doc The output document
     * @param pageno The current page number in the input document
     *
     * @return always true; failures are reported via exception
     *
     * @throws PDFlibException an error occurred in the PDFlib API
     */
    private boolean importPdiPage(output_document doc, int pageno) throws PDFlibException {
        /*
         * The page size will be adjusted later to match the size of the input pages
         */
        doc.p.begin_page_ext(10, 10, "");

        int pdiPage = doc.p.open_pdi_page(doc.pdiHandle, pageno, "");
        if (pdiPage == -1) {
            throw new PDFlibException("Error: " + doc.p.get_errmsg());
        }

        /* Place the input page and adjust the page size */
        doc.p.fit_pdi_page(pdiPage, 0, 0, "adjustpage");
        doc.p.close_pdi_page(pdiPage);
        doc.p.end_page_ext("");
        return true;
    }

    /*
     * Read the text inside a rectangle of a page. The page is opened with
     * PAGE_OPTLIST plus an "includebox" option for the rectangle and closed again
     * before returning.
     *
     * @param tet The TET object for the input document
     * @param doc The TET handle for the input document
     * @param pageNumber The number of the page to read from
     * @param llx x-position of the lower left corner of the rectangle
     * @param lly y-position of the lower left corner of the rectangle
     * @param urx x-position of the upper right corner of the rectangle
     * @param ury y-position of the upper right corner of the rectangle
     *
     * @return The result of get_text() for the page; callers must be prepared
     *         for null
     *
     * @throws TETException An error occurred in the TET API
     */
    private String getTextInBox(TET tet, int doc, int pageNumber, double llx, double lly, double urx, double ury)
            throws TETException {
        String includeBox = "includebox={{ " + llx + " " + lly + " " + urx + " " + ury + " }}";
        int page = tet.open_page(doc, pageNumber, PAGE_OPTLIST + " " + includeBox);
        String text = tet.get_text(page);
        tet.close_page(page);
        return text;
    }

    /*
     * This routine implements the detection of the first page of a sequence.
     *
     * @param tet The TET object for the input document
     * @param doc The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return true if this is the first page of a sequence, false otherwise
     *
     * @throws TETException An error occurred in the TET API
     */
    private boolean isFirstOfSequence(TET tet, int doc, int pageNumber) throws TETException {
        String text = getTextInBox(tet, doc, pageNumber, START_SEQ_TXT_LLX, START_SEQ_TXT_LLY, START_SEQ_TXT_URX,
                START_SEQ_TXT_URY);
        /* Constant first, so a null text simply yields false */
        return START_SEQ_TXT.equals(text);
    }

    /*
     * Fetch the routing criterion from the area of interest.
     *
     * @param tet The TET object for the input document
     * @param doc The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return The String for looking up the output document; may be null if
     *         get_text() delivered nothing for the rectangle
     *
     * @throws TETException An error occurred in the TET API
     */
    private String getRoutingCriterion(TET tet, int doc, int pageNumber) throws TETException {
        return getTextInBox(tet, doc, pageNumber, CRITERION_TXT_LLX, CRITERION_TXT_LLY, CRITERION_TXT_URX,
                CRITERION_TXT_URY);
    }

    /*
     * Turn a criterion into the variable part of an output file name: lower case
     * (independent of the default locale), with every character matched by
     * UNSAFE_FILENAME_CHARS replaced by "_". Lowercasing is done first so that
     * the final result contains safe characters only.
     *
     * @param criterion The non-null criterion text
     *
     * @return The normalized text, also used as key of the output document map
     */
    private static String toFileNamePart(String criterion) {
        return criterion.toLowerCase(Locale.ROOT).replaceAll(UNSAFE_FILENAME_CHARS, "_");
    }

    /*
     * Create a new output document and open the input document for PDI import in
     * it. If any step fails, the freshly created pdflib object is deleted before
     * the exception is propagated, because it is not yet registered in the map
     * of output documents and nobody else would clean it up.
     *
     * @param outputFilename Name of the PDF file to create
     * @param criterion Criterion text, used for the "Subject" info entry
     *
     * @return The description of the new output document
     *
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private output_document createOutputDocument(String outputFilename, String criterion) throws PDFlibException {
        pdflib p = new pdflib();
        boolean created = false;
        try {
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

            if (p.begin_document(outputFilename, "") == -1) {
                throw new PDFlibException("Error: " + p.get_errmsg());
            }

            /* add document info entries */
            p.set_info("Creator", "Burst TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);
            p.set_info("Subject", "Invoices for recipient country " + criterion);

            int pdiHandle = p.open_pdi_document(infilename, "");
            if (pdiHandle == -1) {
                throw new PDFlibException("Error: " + p.get_errmsg());
            }

            output_document result = new output_document(p, pdiHandle, outputFilename);
            created = true;
            return result;
        } finally {
            if (!created) {
                p.delete();
            }
        }
    }

    /*
     * Fetch the output document based on the criterion. Create a new output
     * document if none exists yet for the criterion.
     *
     * Criteria are compared by their normalized form (see toFileNamePart()), so
     * spellings that differ only in case share one output document instead of
     * producing two documents that write to the same file. A null or blank
     * criterion is mapped to UNKNOWN_CRITERION.
     *
     * @param criterion Criterion for identifying the output document, may be null
     *
     * @return The output document for the criterion
     *
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private output_document fetchOutputDocument(String criterion) throws PDFlibException {
        String label = (criterion == null || criterion.trim().isEmpty()) ? UNKNOWN_CRITERION : criterion;
        String key = toFileNamePart(label);

        output_document retval = outputDocuments.get(key);
        if (retval == null) {
            retval = createOutputDocument(outfileBasename + "_" + key + PDF_SUFFIX, label);
            outputDocuments.put(key, retval);
        }
        return retval;
    }

    /*
     * Based on some criteria decide to which output document the current page
     * should go. First the function identifies whether the page is the start of a
     * new sequence or the continuation of a sequence. In the first case the output
     * document is looked up in the map of output documents, and created if
     * necessary. In the second case the page is simply routed to the current
     * document. The very first processed page is always treated like the start of
     * a sequence because no current document exists yet.
     *
     * @param tet The TET object for the input document
     * @param doc The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return The document to which the current page of the input document shall be
     *         routed to
     *
     * @throws TETException An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private output_document routePage(TET tet, int doc, int pageNumber) throws TETException, PDFlibException {
        if (currentOutputDocument == null || isFirstOfSequence(tet, doc, pageNumber)) {
            String criterion = getRoutingCriterion(tet, doc, pageNumber);
            currentOutputDocument = fetchOutputDocument(criterion);
        }
        return currentOutputDocument;
    }

    /*
     * Process a page: Determine into which output document the current page should
     * be placed, and put it into the output document. A page that cannot be
     * opened by TET is reported on System.err and skipped.
     *
     * @param tet TET object
     * @param doc TET document handle
     * @param pageno The current page number
     *
     * @throws TETException An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException, PDFlibException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /*
         * Decide about routing the input pages
         */
        output_document o = routePage(tet, doc, pageno);

        /*
         * Copy page from input document to output document.
         */
        importPdiPage(o, pageno);

        /*
         * Close page in the input document.
         */
        tet.close_page(page);
    }

    /*
     * Print the details of a TET or PDFlib exception to System.err.
     *
     * @param errnum Error number of the exception
     * @param apiname Name of the API function that failed
     * @param errmsg Error message of the exception
     * @param pageno Page being processed, or 0 if the error is not related to a
     *        particular page
     */
    private static void reportError(int errnum, String apiname, String errmsg, int pageno) {
        String location = (pageno == 0) ? "(): " : "() on page " + pageno + ": ";
        System.err.println("Error " + errnum + " in " + apiname + location + errmsg + "\n");
    }

    /*
     * Run the splitting: open the input document with TET, route every page to
     * its output document, then finish all output documents. TET and PDFlib
     * exceptions are reported on System.err; all TET and PDFlib objects are
     * deleted in any case.
     */
    private void execute() {
        TET tet = null;
        int pageno = 0;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, pageno);
            }

            /*
             * Page processing is finished; errors from here on must not be
             * attributed to the non-existing page n_pages + 1.
             */
            pageno = 0;

            /*
             * Close all output documents
             */
            for (output_document o : outputDocuments.values()) {
                o.p.end_document("");
                o.p.close_pdi_document(o.pdiHandle);
                out.println("Closed output document \"" + o.filename + "\"");
            }

            tet.close_document(doc);
        } catch (TETException e) {
            reportError(e.get_errnum(), e.get_apiname(), e.get_errmsg(), pageno);
        } catch (PDFlibException e) {
            reportError(e.get_errnum(), e.get_apiname(), e.get_errmsg(), pageno);
        } finally {
            /* tet is still null if the TET constructor itself failed */
            if (tet != null) {
                tet.delete();
            }
            for (output_document o : outputDocuments.values()) {
                o.p.delete();
            }
            /* The deleted objects must not be reachable any longer */
            outputDocuments.clear();
            currentOutputDocument = null;
        }
    }

    /*
     * @param infilename the name of the input file that will be split
     * @param outfilename the name from which the output file names are derived; a
     *        trailing ".pdf" (in any letter case) is removed
     */
    private burst(String infilename, String outfilename) {
        this.infilename = infilename;

        /*
         * As the input document will be split into multiple output documents, strip a
         * potential ".pdf" suffix from the name. Only a real suffix is stripped; a
         * ".pdf" somewhere in the middle of the name is left alone.
         * regionMatches() returns false for a negative offset, so names shorter than
         * the suffix need no special handling.
         */
        int suffixStart = outfilename.length() - PDF_SUFFIX.length();
        boolean hasPdfSuffix = outfilename.regionMatches(true, suffixStart, PDF_SUFFIX, 0, PDF_SUFFIX.length());
        this.outfileBasename = hasPdfSuffix ? outfilename.substring(0, suffixStart) : outfilename;
    }

    /*
     * Command line entry point.
     *
     * @param args the input file name followed by the output file basename
     *
     * @throws UnsupportedEncodingException OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 2) {
            out.println("usage: burst <infilename> <outfile basename>");
            return;
        }

        burst t = new burst(args[0], args[1]);
        t.execute();
    }
}