tet_and_pdflib/search_and_replace_text
Find text with TET, hide it with a white rectangle, and add the replacement text on top of it.
Download Java Code Show Output Show Input (Whitepaper-Technical-Introduction-to-PDFA.pdf)
/*
* Find text with TET, hide it with a white rectangle, and place some replacement
* text on top of it to approximate a search-and-replace operation. Note that
* the replaced text will still be retrievable from the output file.
*
* The program has a basic algorithm to handle fragmented words, e.g. hyphenated
* words or words with "drop caps". It is important to understand the
* limitations of this approach, as it will produce poor results in some
* situations. Hyphenations for the replacement word are most likely wrong, the
* white rectangle could be too large or too small, etc.
*
* Having said that, it is generally a bad idea to take this approach to replace
* text in existing PDF documents, and it should only be used when preparing
* print documents in certain situations, or as a last resort for online documents.
*
* Required software: TET 5.2 and PDFlib+PDI 9
*
* Required data: PDF document
*/
package com.pdflib.cookbook.tet.tet_and_pdflib;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;
class search_and_replace_text {
/*
* Common search path for PDI and TET to find the input document. It is
* embedded in GLOBAL_OPTLIST for TET and passed separately to PDFlib in
* execute().
*/
private static final String DOC_SEARCH_PATH = "../input";
/*
* Global option list for TET. The program expects the "resource" directory
* parallel to the "java" directory.
*/
private static final String GLOBAL_OPTLIST =
"searchpath={../resource/cmap ../resource/glyphlist "
+ DOC_SEARCH_PATH + "}";
/*
* Document specific option list, passed to TET when opening the input
* document. Empty in this example.
*/
private static final String DOC_OPTLIST = "";
/*
* Page-specific option list. The program uses granularity "word" because
* it matches word-wise with the regular expression defined in
* constant SEARCH_TERM_REGEX.
*
* "contentanalysis={keephyphenglyphs}" is specified because we want to
* capture the geometry of hyphens as well, in order to be able to
* overpaint them in the replacement color.
*/
private static final String PAGE_OPTLIST =
"granularity=word contentanalysis={keephyphenglyphs}";
/*
* The encoding in which the output is sent to System.out. It is taken from
* the "file.encoding" system property. For running the example in a
* Windows command window, you can set this for example to "windows-1252"
* for getting Latin-1 output.
*/
private static final String OUTPUT_ENCODING = System
.getProperty("file.encoding");
/*
* Because of rounding errors, there can be small variations in the
* baseline information. We use an epsilon value of 0.01 to ignore
* variations that are too small to be meaningful. The value is compared
* against differences of the y positions reported by TET (presumably in
* PDF default units, i.e. points - confirm against the TET manual).
*/
private static final double BASELINE_EPSILON = 0.01;
/*
* For printing to System.out in the encoding specified via OUTPUT_ENCODING.
* Assigned in main() before an instance of this class is created.
*/
private static PrintStream out;
/*
* The name of the input file
*/
private String infilename;
/*
* The name of the output file
*/
private String outfilename;
/*
* The format for printing the x and y coordinate values in warnings and
* verbose output. Configured in the constructor.
*/
private NumberFormat coordFormat;
/*
* The search terms to replace, specified as a regular expression. In
* the example we search for "metadata" (case-insensitively), and replace
* it by its uppercase form. The expression must match a whole word as
* delivered by TET, because replace_text() uses Matcher.matches().
*/
private static final Pattern SEARCH_TERM_REGEX =
Pattern.compile("(?i)metadata");
/*
* Name of the font for the replacement text, loaded in execute().
*/
private static final String REPLACEMENT_FONT = "Times";
/*
* Counter for total replacements, incremented once per matched word.
*/
private int replacements = 0;
/*
* Counter for fragmented words, i.e. matched words that were split into
* more than one rectangle.
*/
private int fragmented = 0;
/*
* Set to true for more verbose output regarding the identified rectangles
* and the replacements that were performed.
*/
private static boolean verbose = false;
/*
* Helper class to store rectangle data.
*/
private class rectangle {
rectangle(double baseline, double fontsize,
double llx, double lly, double urx, double ury, boolean hyphenated) {
this.llx = llx;
this.lly = lly;
this.urx = urx;
this.ury = ury;
this.baseline = baseline;
this.fontsize = fontsize;
this.hyphenated = hyphenated;
}
double width() {
return urx - llx;
}
double height() {
return ury - lly;
}
double llx;
double lly;
double urx;
double ury;
double fontsize;
double baseline;
boolean hyphenated;
}
/*
* Import the current page from the PDI import document and place it in the
* output document.
*
* @param p
* the pdflib object
* @param pdiHandle
* the PDI handle for the input document
* @param pageno
* the current page number
*
* @throws PDFlibException
* an error occurred in the PDFlib API
*/
private boolean importPdiPage(pdflib p, int pdiHandle, int pageno)
        throws PDFlibException {
    /*
     * Begin the output page with a dummy size of 10x10; the "adjustpage"
     * option below resizes it to the dimensions of the imported page.
     */
    p.begin_page_ext(10, 10, "");

    final int importedPage = p.open_pdi_page(pdiHandle, pageno, "");
    final boolean opened = importedPage != -1;

    if (opened) {
        /* Place the input page at the origin and adjust the page size */
        p.fit_pdi_page(importedPage, 0, 0, "adjustpage");
        p.close_pdi_page(importedPage);
    }
    else {
        /*
         * The output page begun above stays open in this case; only the
         * failure is reported here.
         */
        System.err.println("Error: " + p.get_errmsg());
    }
    return opened;
}
/*
* Split the matched word into fragments. A fragment is defined by
* having the same baseline and the same fontsize. As soon as one of
* these values changes, a new fragment starts.
*
* @param tet
* The TET object
* @param doc
* The TET document handle for the input document
* @param page
* The page handle for the current page
* @param pageno
* The number of the current page
* @param matchedText
* The currently matched word
*
* @return A List containing fragment rectangles
*
* @throws TETException
* An error occurred in the TET API
*/
private List<rectangle> analyze_word_fragments(TET tet, final int doc,
        final int page, final int pageno, final String matchedText)
        throws TETException {
    List<rectangle> result = new LinkedList<rectangle>();
    boolean first = true;
    double llx = 0, lly = 0, urx = 0, ury = 0;
    double baseline = 0, fontsize = 0;

    /*
     * Loop over all characters, watch the y position for a jump or a change
     * in the fontsize to detect a word that spreads over two lines or is
     * split by other conditions, e.g. "drop caps".
     */
    while (tet.get_char_info(page) != -1) {
        /*
         * Get ascender and descender, which are expressed relative to a
         * font scaling factor of 1000. Descender will be returned as a
         * negative number, therefore it will be added to the baseline y
         * position to get the lower left y value. The values are fetched
         * per character because the font may change within a word.
         */
        final String fontPath = "fonts[" + tet.fontid + "]";
        final double descender =
            tet.pcos_get_number(doc, fontPath + "/descender") / 1000;
        final double ascender =
            tet.pcos_get_number(doc, fontPath + "/ascender") / 1000;

        if (first) {
            /* The first character opens the first fragment. */
            llx = tet.x;
            baseline = tet.y;
            fontsize = tet.fontsize;
            lly = tet.y + descender * tet.fontsize;
            first = false;
        }
        else if (Math.abs(baseline - tet.y) > BASELINE_EPSILON
                || fontsize != tet.fontsize) {
            /*
             * y value jumped or fontsize changed, so complete the previous
             * rectangle. The flag is read from the character that starts
             * the new fragment; TET.ATTR_DEHYPHENATION_POST on it is taken
             * to mean that the fragment just completed ended with a
             * hyphenation hyphen.
             */
            boolean hyphenated =
                (tet.attributes & TET.ATTR_DEHYPHENATION_POST) != 0;
            result.add(new rectangle(baseline, fontsize, llx, lly, urx, ury,
                    hyphenated));

            /* The current character opens the next fragment. */
            baseline = tet.y;
            fontsize = tet.fontsize;
            llx = tet.x;
            lly = tet.y + descender * tet.fontsize;
        }

        /* Extend the current fragment to include the current character. */
        urx = tet.x + tet.width;
        ury = tet.y + ascender * tet.fontsize;
    }

    /*
     * Without any character information there is no geometry to report.
     * Return an empty list instead of a meaningless all-zero rectangle.
     */
    if (first) {
        return result;
    }

    /*
     * Add the last identified rectangle, which can by definition not be
     * hyphenated.
     */
    result.add(new rectangle(baseline, fontsize, llx, lly, urx, ury, false));

    if (result.size() > 1) {
        fragmented += 1;

        /*
         * Report the position where the word starts, i.e. the lower left
         * corner of the first fragment. The local llx/lly variables hold
         * the values of the last fragment at this point.
         */
        final rectangle start = result.get(0);
        System.err.println("Warning: On page " + pageno
            + " the search text \"" + matchedText + "\" extends over "
            + "multiple rectangles, starting at " + "x="
            + coordFormat.format(start.llx) + ", y="
            + coordFormat.format(start.lly)
            + ", result is questionable.");
    }
    return result;
}
/*
* Paint the given rectangle in white.
*
* @param p
* The pdflib object
* @param pageno
* The number of the current page
* @param r
* The rectangle to paint
* @throws PDFlibException
* An error occurred in the PDFlib API
*/
private void paint_rectangle(pdflib p, int pageno, rectangle r)
        throws PDFlibException {
    final double boxWidth = r.width();
    final double boxHeight = r.height();

    /*
     * Fill the area with gray level 1 (white). save/restore keeps the
     * color change local to this method.
     */
    p.save();
    p.setcolor("fillstroke", "gray", 1, 0, 0, 0);
    p.rect(r.llx, r.lly, boxWidth, boxHeight);
    p.fill();
    p.restore();

    if (!verbose) {
        return;
    }

    /* Optional trace of the painted area. */
    StringBuilder message = new StringBuilder("Painted white rectangle at ");
    message.append("x=").append(coordFormat.format(r.llx));
    message.append(", y=").append(coordFormat.format(r.lly));
    message.append(", width=").append(coordFormat.format(boxWidth));
    message.append(", height=").append(coordFormat.format(boxHeight));
    out.println(message.toString());
}
/*
* Method that implements the actual replacement.
*
* @param matchedText
* The text to replace
* @return The replacement for the matchedText
*/
private String get_replacement_text(String matchedText) {
    /*
     * Use a fixed locale so that the generated document does not depend on
     * the default locale of the machine running the program (e.g. "i" is
     * uppercased to a dotted capital I under a Turkish default locale).
     */
    return matchedText.toUpperCase(Locale.ROOT);
}
/*
* Paint the rectangles in white, and fill the rectangles sequentially with
* text, with the following strategy:
*
* - Put at least one character in a rectangle
* - If this is the last rectangle, fill in the rest of the text
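* - Otherwise fill the rectangle by adding characters until the text no
* longer fits or no characters are left; the character that makes the
* text overflow is kept, and the text is scaled down when it is placed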
*
* @param font
* The font handle
* @param p
* The pdflib object
* @param pageno
* The number of the current page
* @param matchedText
* The matched text
* @param rectangles
* The list of word fragments to replace
*
* @throws PDFlibException
* An error occurred in the PDFlib API
*/
private void replace_fragments(int font, pdflib p, int pageno,
        String matchedText, List<rectangle> rectangles) throws PDFlibException {
    final String replacementText = get_replacement_text(matchedText);
    final int replacementLength = replacementText.length();

    /* Index of the first replacement character that is not yet placed. */
    int replacementIndex = 0;

    Iterator<rectangle> i = rectangles.iterator();
    while (i.hasNext()) {
        rectangle r = i.next();
        paint_rectangle(p, pageno, r);

        final int fragBegin = replacementIndex;
        int fragEnd = fragBegin;

        if (!i.hasNext()) {
            /*
             * Last fragment: it receives the rest of the text.
             */
            fragEnd = replacementLength;
        }
        else if (fragBegin < replacementLength) {
            /*
             * Not the last fragment, compute how many characters of the
             * replacement text go into the current rectangle. The width is
             * measured on the replacement text, because that is the text
             * that will actually be placed.
             */
            String optlist = "font=" + font + " fontsize=" + r.fontsize;
            double filledWidth = 0;

            /*
             * At least one character is put into the box, plus a hyphen
             * if the original rectangle ended with a hyphen. The loop
             * stops after the first character that makes the text wider
             * than the rectangle; that character is kept, and the text is
             * shrunk via fitmethod=auto when it is placed below.
             */
            do {
                fragEnd += 1;
                String fragment =
                    replacementText.substring(fragBegin, fragEnd);
                if (r.hyphenated) {
                    fragment += "-";
                }
                filledWidth = p.info_textline(fragment, "width", optlist);
            }
            while (filledWidth <= r.width() && fragEnd < replacementLength);
        }
        /*
         * else: the replacement text is already used up by the previous
         * fragments, so this rectangle gets no characters (fragEnd stays
         * at fragBegin). Reading past the end of the text would throw a
         * StringIndexOutOfBoundsException.
         */

        String replacementFragment =
            replacementText.substring(fragBegin, fragEnd);
        if (r.hyphenated) {
            replacementFragment += "-";
        }

        if (replacementFragment.length() > 0) {
            p.save();

            /*
             * The text must be positioned vertically at the same baseline
             * as the original text.
             *
             * PDFlib calculates the scaling for the replacement text so it
             * fits into the box (fitmethod=auto).
             *
             * The setcolor call is intended for highlighting the
             * replacement text, delete this for getting the replacement
             * text in the default color.
             */
            p.setcolor("fillstroke", "rgb", 1, 0, 0, 0);

            String optlist = "font=" + font + " " + "boxsize={" + r.width()
                + " " + r.fontsize + "} " + "position={left bottom} "
                + "fitmethod=auto fontsize=" + r.fontsize + " "
                + "shrinklimit=65%";
            p.fit_textline(replacementFragment, r.llx, r.baseline, optlist);

            p.restore();
        }

        if (verbose) {
            out.println("Replaced \"" + matchedText + "\" with \""
                + replacementText + "\"");
        }

        replacementIndex = fragEnd;
    }
}
/*
* Check whether the given word matches the search term regular expression,
* analyze the geometry of the word, replace the fragments with white
* rectangles and put the replacement word into the fragments.
*
* @param tet
* The TET object
* @param doc
* The TET document handle for the input document
* @param font
* Font handle
* @param p
* pdflib object
* @param page
* Handle for the current page
* @param pageno
* The current page number
* @param word
* The current word that potentially will be replaced
*
* @throws TETException
* An error occurred in the TET API
* @throws PDFlibException
* An error occurred in the PDFlib API
*/
private void replace_text(final TET tet, final int doc, final int font,
        final pdflib p, final int page,
        final int pageno, final String word) throws TETException, PDFlibException {
    /*
     * Only words that match the search expression as a whole are replaced;
     * everything else is left untouched.
     */
    final Matcher candidate = SEARCH_TERM_REGEX.matcher(word);
    if (!candidate.matches()) {
        return;
    }

    replacements += 1;
    final String hit = candidate.group(0);

    /*
     * First determine the rectangles covered by this instance of the
     * search term, then overpaint them and place the replacement text.
     */
    final List<rectangle> pieces =
        analyze_word_fragments(tet, doc, page, pageno, hit);
    replace_fragments(font, p, pageno, hit, pieces);
}
/*
* Process a page: Create a new page in the output document, place the page
* from the input document in the output document, and replace all
* occurrences of the search term with its uppercase form.
*
* @param tet
* TET object
* @param doc
* TET document handle
* @param font
* Font for replacement text
* @param p
* pdflib object
* @param pdiHandle
* PDI document handle
* @param pageno
* The current page number
* @throws TETException
* An error occurred in the TET API
* @throws PDFlibException
* An error occurred in the PDFlib API
*/
private void process_page(TET tet, final int doc, int font, pdflib p,
        int pdiHandle, int pageno) throws TETException, PDFlibException {
    /*
     * Copy page from input document to output document. importPdiPage()
     * begins the output page before it tries to open the input page, so
     * the output page must be ended on every path below, otherwise the
     * next begin_page_ext() call would be issued while a page is still
     * open.
     */
    if (!importPdiPage(p, pdiHandle, pageno)) {
        /*
         * The error was already reported by importPdiPage(). Without the
         * page contents there is nothing sensible to replace.
         */
        p.end_page_ext("");
        return;
    }

    final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
    if (page == -1) {
        System.err.println("Error " + tet.get_errnum() + " in "
            + tet.get_apiname() + "(): " + tet.get_errmsg());

        /* Keep the imported page in the output, just without changes. */
        p.end_page_ext("");
        return;
    }

    /* Retrieve all text fragments for the page */
    for (String text = tet.get_text(page); text != null; text = tet
            .get_text(page)) {
        replace_text(tet, doc, font, p, page, pageno, text);
    }

    /* get_text() returns null both at the end of the page and on error. */
    if (tet.get_errnum() != 0) {
        System.err.println("Error " + tet.get_errnum() + " in "
            + tet.get_apiname() + "(): " + tet.get_errmsg());
    }

    /*
     * Close page in the input and output documents.
     */
    p.end_page_ext("");
    tet.close_page(page);
}
/*
 * Run the whole conversion: open the input document with both PDI and
 * TET, process it page by page, and write the output document. Errors are
 * reported on System.err; the method does not throw.
 */
private void execute() {
    TET tet = null;
    pdflib p = null;

    /* 0 means "no page is being processed yet", see report_api_error(). */
    int pageno = 0;

    try {
        tet = new TET();
        tet.set_option(GLOBAL_OPTLIST);

        p = new pdflib();
        p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

        if (p.begin_document(outfilename, "") == -1) {
            System.err.println("Error: " + p.get_errmsg());
            return;
        }

        /* add document info entries */
        p.set_info("Creator", "Search and Replace TET Cookbook Example");
        p.set_info("Author", "PDFlib GmbH");
        p.set_info("Title", infilename);
        p.set_info("Subject", "Replace text matched by regex \""
            + SEARCH_TERM_REGEX.pattern()
            + "\" with its uppercase form" );

        int pdiHandle = p.open_pdi_document(infilename, "");
        if (pdiHandle == -1) {
            System.err.println("Error: " + p.get_errmsg());
            return;
        }

        /*
         * Load the font for the replacement text. The font size is chosen
         * per fragment when the text is placed.
         */
        int font = p.load_font(REPLACEMENT_FONT, "unicode", "");
        if (font == -1) {
            System.err.println("Error loading font: " + p.get_errmsg());
            return;
        }

        final int doc = tet.open_document(infilename, DOC_OPTLIST);
        if (doc == -1) {
            System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /*
         * Loop over pages in the document
         */
        final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
        for (pageno = 1; pageno <= n_pages; ++pageno) {
            process_page(tet, doc, font, p, pdiHandle, pageno);
        }

        out.println("Replaced " + replacements + " words, "
            + fragmented + " words were fragmented");

        p.end_document("");
        p.close_pdi_document(pdiHandle);
        tet.close_document(doc);
    }
    catch (TETException e) {
        report_api_error(e.get_errnum(), e.get_apiname(), e.get_errmsg(),
            pageno);
    }
    catch (PDFlibException e) {
        report_api_error(e.get_errnum(), e.get_apiname(), e.get_errmsg(),
            pageno);
    }
    finally {
        /*
         * Either object is still null if its constructor threw. Guard the
         * cleanup so that a NullPointerException does not replace the
         * error that was just reported.
         */
        if (tet != null) {
            tet.delete();
        }
        if (p != null) {
            p.delete();
        }
    }
}

/*
 * Print an error raised by the TET or PDFlib API to System.err.
 *
 * @param errnum
 *            the error number reported by the API
 * @param apiname
 *            the name of the API function that failed
 * @param errmsg
 *            the error message reported by the API
 * @param pageno
 *            the page being processed, or 0 if the error occurred before
 *            the first page was processed
 */
private static void report_api_error(int errnum, String apiname,
        String errmsg, int pageno) {
    final String location =
        (pageno == 0) ? "(): " : "() on page " + pageno + ": ";
    System.err.println("Error " + errnum + " in " + apiname + location
        + errmsg + "\n");
}
/*
* @param infilename
* the name of the file for which the file with replaced text
* will be generated
* @param outfilename
* the name of the output file
*/
private search_and_replace_text(String infilename, String outfilename) {
this.infilename = infilename;
this.outfilename = outfilename;
this.coordFormat = NumberFormat.getInstance();
coordFormat.setMinimumFractionDigits(0);
coordFormat.setMaximumFractionDigits(2);
}
/*
 * Program entry point. Expects exactly two arguments: the name of the input
 * file and the name of the output file.
 *
 * @param args
 *            the command line arguments
 * @throws UnsupportedEncodingException
 *             if OUTPUT_ENCODING is not a supported encoding
 */
public static void main(String[] args) throws UnsupportedEncodingException {
    System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");

    /* All regular program output goes through this re-encoding stream. */
    out = new PrintStream(System.out, true, OUTPUT_ENCODING);

    if (args.length == 2) {
        final String inputName = args[0];
        final String outputName = args[1];
        new search_and_replace_text(inputName, outputName).execute();
    }
    else {
        out.println("usage: search_and_replace_text <infilename> <outfilename>");
    }
}
}