tet_and_pdflib/search_and_replace_text

Find text with TET, hide it with a white rectangle, and add the replacement text on top of it.
Download Java Code Show Output Show Input (Whitepaper-Technical-Introduction-to-PDFA.pdf)
/*
 * Find text with TET, hide it with a white rectangle, and place some replacement
 * text on top of it to approximate a search-and-replace operation. Note that
 * the replaced text will still be retrievable from the output file.
 * 
 * The program has a basic algorithm to handle fragmented words, e.g. hyphenated
 * words or words with "drop caps". It is important to understand the
 * limitations of this approach, as it will produce poor results in some
 * situations. Hyphenations for the replacement word are most likely wrong, the
 * white rectangle could be too large or too small, etc.
 * 
 * Having said that, it is generally a bad idea to take this approach to replace
 * text in existing PDF documents, and it should only be used when preparing
 * print documents in certain situations, or as a last resort for online documents.
 * 
 * Required software: TET 5.2 and PDFlib+PDI 9
 * 
 * Required data: PDF document
 */

package com.pdflib.cookbook.tet.tet_and_pdflib;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;

class search_and_replace_text {
    /*
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /*
     * Global option list. Besides the document search path it names the
     * "cmap" and "glyphlist" resource directories; the program expects the
     * "resource" directory parallel to the "java" directory.
     */
    private static final String GLOBAL_OPTLIST =
        "searchpath={../resource/cmap ../resource/glyphlist "
            + DOC_SEARCH_PATH + "}";

    /*
     * Document specific option list (empty: no document options are set).
     */
    private static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list. The program uses granularity "word" because
     * it matches word-wise with the regular expression defined in
     * constant SEARCH_TERM_REGEX.
     * 
     * "contentanalysis={keephyphenglyphs}" is specified because we want to
     * capture the geometry of hyphens as well, in order to be able to
     * overpaint them in the replacement color.
     */
    private static final String PAGE_OPTLIST = 
        "granularity=word contentanalysis={keephyphenglyphs}";

    /*
     * The encoding in which the output is sent to System.out. It is taken
     * from the "file.encoding" system property. For running the example in a
     * Windows command window, you can set this for example to "windows-1252"
     * for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System
            .getProperty("file.encoding");

    /*
     * Because of rounding errors, there can be small variations in the
     * baseline information. We use an epsilon value of 0.01 to ignore
     * variations that are too small to be meaningful.
     */
    private static final double BASELINE_EPSILON = 0.01;
    
    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Initialized in main() before any instance is created.
     */
    private static PrintStream out;

    /*
     * The name of the input file. Set once in the constructor.
     */
    private final String infilename;

    /*
     * The name of the output file. Set once in the constructor.
     */
    private final String outfilename;

    /*
     * The format for printing the x and y coordinate values. Set once in the
     * constructor.
     */
    private final NumberFormat coordFormat;

    /*
     * The search terms to replace, specified as a regular expression. In
     * the example we search for "metadata" (case-insensitively), and replace
     * it by its uppercase form.
     */
    private static final Pattern SEARCH_TERM_REGEX =
                                    Pattern.compile("(?i)metadata");

    /*
     * Font for replacement text.
     */
    private static final String REPLACEMENT_FONT = "Times";

    /*
     * Counter for total replacements.
     */
    private int replacements = 0;

    /*
     * Counter for fragmented words, i.e. matched words that had to be split
     * into more than one rectangle.
     */
    private int fragmented = 0;

    /*
     * Set to true for more verbose output regarding the identified rectangles.
     * Deliberately not final so that it can be toggled by editing this line.
     */
    private static boolean verbose = false;

    /*
     * Helper class to store rectangle data.
     */
    private class rectangle {
        rectangle(double baseline, double fontsize,
                double llx, double lly, double urx, double ury, boolean hyphenated) {
            this.llx = llx;
            this.lly = lly;
            this.urx = urx;
            this.ury = ury;

            this.baseline = baseline;
            this.fontsize = fontsize;
            
            this.hyphenated = hyphenated;
        }

        double width() {
            return urx - llx;
        }

        double height() {
            return ury - lly;
        }

        double llx;
        double lly;
        double urx;
        double ury;

        double fontsize;
        double baseline;
        
        boolean hyphenated;
    }

    /*
     * Begin a new page in the output document and place the given page of the
     * PDI import document on it.
     *
     * The output page is started with a dummy size of 10 x 10; placing the
     * imported page with the "adjustpage" option resizes it to match the
     * input page.
     *
     * Note that the output page is begun in any case and is NOT ended here,
     * not even when the input page cannot be opened.
     *
     * @param p
     *            the pdflib object
     * @param pdiHandle
     *            the PDI handle for the input document
     * @param pageno
     *            the current page number
     *
     * @return true if the input page was placed, false if it could not be
     *         opened (the error message is printed to System.err)
     *
     * @throws PDFlibException
     *             an error occurred in the PDFlib API
     */
    private boolean importPdiPage(pdflib p, int pdiHandle, int pageno)
            throws PDFlibException {
        /* Dummy size, corrected below by "adjustpage" */
        p.begin_page_ext(10, 10, "");

        final int importedPage = p.open_pdi_page(pdiHandle, pageno, "");
        final boolean opened = importedPage != -1;

        if (opened) {
            /* Place the input page and adjust the page size */
            p.fit_pdi_page(importedPage, 0, 0, "adjustpage");
            p.close_pdi_page(importedPage);
        }
        else {
            System.err.println("Error: " + p.get_errmsg());
        }

        return opened;
    }

    /*
     * Split the matched word into fragments. A fragment is defined by
     * having the same baseline and the same fontsize. As soon as one of
     * these values changes, a new fragment starts.
     *
     * If the word consists of more than one fragment, the counter for
     * fragmented words is incremented and a warning with the position of the
     * first fragment is printed to System.err.
     *
     * @param tet
     *            The TET object
     * @param doc 
     *            The TET document handle for the input document
     * @param page
     *            The page handle for the current page
     * @param pageno
     *            The number of the current page
     * @param matchedText
     *            The currently matched word
     *
     * @return A List containing the fragment rectangles in reading order;
     *         empty if no character information was available for the word
     *
     * @throws TETException
     *             An error occurred in the TET API
     */
    private List<rectangle> analyze_word_fragments(TET tet, final int doc,
        final int page, final int pageno, final String matchedText)
            throws TETException {
        List<rectangle> result = new LinkedList<rectangle>();
        boolean first = true;
        double llx = 0, lly = 0, urx = 0, ury = 0;
        double baseline = 0, fontsize = 0;

        /*
         * Loop over all characters, watch the y position for a jump or a change
         * in the fontsize to detect a word that spreads over two lines or split
         * by other conditions, e.g. "drop caps".
         */
        while (tet.get_char_info(page) != -1) {
            /*
             * Get ascender and descender, which are expressed relative to a
             * font scaling factor of 1000. Descender will be returned as a
             * negative number, therefore it will be added to the baseline y
             * position to get the lower left y value.
             */
            final double descender = tet.pcos_get_number(doc,
                "fonts[" + tet.fontid + "]/descender") / 1000;
            final double ascender = tet.pcos_get_number(doc,
                "fonts[" + tet.fontid + "]/ascender") / 1000;

            if (first) {
                llx = tet.x;
                baseline = tet.y;
                fontsize = tet.fontsize;
                lly = tet.y + descender * tet.fontsize;
                first = false;
            }
            else if (Math.abs(baseline - tet.y) > BASELINE_EPSILON
                || fontsize != tet.fontsize) {
                /*
                 * y value jumped or fontsize changed, so complete the previous
                 * rectangle. TET.ATTR_DEHYPHENATION_POST indicates that the
                 * previous character was a hyphenation artifact.
                 */
                boolean hyphenated = (tet.attributes & TET.ATTR_DEHYPHENATION_POST) != 0;
                result.add(new rectangle(baseline, fontsize, llx, lly, urx, ury,
                    hyphenated));
                baseline = tet.y;
                fontsize = tet.fontsize;
                llx = tet.x;
                lly = tet.y + descender * tet.fontsize;
            }

            /* The current character always extends the open rectangle. */
            urx = tet.x + tet.width;
            ury = tet.y + ascender * tet.fontsize;
        }

        if (first) {
            /*
             * No character information was delivered at all, so there is no
             * geometry to report. Returning an all-zero rectangle would make
             * the caller place the replacement text at the page origin.
             */
            return result;
        }

        /*
         * Add the last identified rectangle, which can by definition not be
         * hyphenated.
         */
        result
            .add(new rectangle(baseline, fontsize, llx, lly, urx, ury, false));

        if (result.size() > 1) {
            fragmented += 1;

            /*
             * llx/lly describe the last fragment at this point, so take the
             * start position of the word from the first rectangle.
             */
            final rectangle start = result.get(0);

            System.err.println("Warning: On page " + pageno
                + " the search text \"" + matchedText + "\" extends over "
                + "multiple rectangles, starting at " + "x="
                + coordFormat.format(start.llx) + ", y="
                + coordFormat.format(start.lly)
                + ", result is questionable.");
        }

        return result;
    }

    /*
     * Cover the given rectangle with a white filled box. The graphics state
     * is saved before and restored after painting, so the color change does
     * not leak to the caller. In verbose mode the geometry of the painted
     * box is printed.
     *
     * @param p
     *            The pdflib object
     * @param pageno
     *            The number of the current page (currently not used)
     * @param r
     *            The rectangle to paint
     * @throws PDFlibException
     *             An error occurred in the PDFlib API
     */
    private void paint_rectangle(pdflib p, int pageno, rectangle r)
            throws PDFlibException {
        final double boxWidth = r.width();
        final double boxHeight = r.height();

        p.save();
        /* gray level 1 is white */
        p.setcolor("fillstroke", "gray", 1, 0, 0, 0);
        p.rect(r.llx, r.lly, boxWidth, boxHeight);
        p.fill();
        p.restore();

        if (!verbose) {
            return;
        }

        StringBuilder message = new StringBuilder("Painted white rectangle at ");
        message.append("x=").append(coordFormat.format(r.llx));
        message.append(", y=").append(coordFormat.format(r.lly));
        message.append(", width=").append(coordFormat.format(boxWidth));
        message.append(", height=").append(coordFormat.format(boxHeight));
        out.println(message.toString());
    }

    /*
     * Method that implements the actual replacement: the matched text is
     * converted to upper case.
     *
     * The conversion uses the locale-neutral rules of Locale.ROOT so that the
     * result does not depend on the default locale of the JVM (for example,
     * a Turkish default locale would otherwise turn "i" into a dotted
     * capital I).
     *
     * @param matchedText
     *            The text to replace
     * @return The replacement for the matchedText
     */
    private String get_replacement_text(String matchedText) {
        return matchedText.toUpperCase(java.util.Locale.ROOT);
    }

    /*
     * Paint the rectangles in white, and fill the rectangles sequentially with
     * the replacement text, with the following strategy:
     * 
     * - If this is the last rectangle, fill in the rest of the text
     * - Otherwise put at least one character in the rectangle and keep adding
     * characters until the measured width exceeds the rectangle or the text
     * is used up. The fragment may therefore overshoot the rectangle by one
     * character; fitmethod=auto shrinks it (down to the shrinklimit) when
     * the text is placed.
     * - If the text is already used up before the last rectangle is reached,
     * the remaining rectangles are only painted white (plus a hyphen where
     * the original fragment was hyphenated).
     *
     * All measuring is done on the replacement text, because that is the text
     * which is actually placed.
     *
     * @param font
     *            The font handle
     * @param p
     *            The pdflib object
     * @param pageno
     *            The number of the current page
     * @param matchedText
     *            The matched text
     * @param rectangles
     *            The list of word fragments to replace
     *
     * @throws PDFlibException
     *             An error occurred in the PDFlib API
     */
    private void replace_fragments(int font, pdflib p, int pageno,
            String matchedText, List<rectangle> rectangles) throws PDFlibException {
        final String replacementText = get_replacement_text(matchedText);
        final int replacementLength = replacementText.length();

        /* Index of the first replacement character that is not yet placed */
        int replacementIndex = 0;

        Iterator<rectangle> i = rectangles.iterator();
        while (i.hasNext()) {
            final rectangle r = i.next();

            paint_rectangle(p, pageno, r);

            final int fragBegin = replacementIndex;
            int fragEnd;

            if (!i.hasNext() || fragBegin >= replacementLength) {
                /*
                 * Last fragment: the rest of the text. If the text is already
                 * used up, this yields an empty fragment instead of running
                 * past the end of the string.
                 */
                fragEnd = replacementLength;
            }
            else {
                /*
                 * Not the last fragment, compute how many characters fit into
                 * the current rectangle.
                 */
                fragEnd = fragBegin;

                String measureOptlist =
                        "font=" + font + " fontsize=" + r.fontsize;
                double filledWidth = 0;

                /*
                 * At least one character is put into the box, plus a hyphen
                 * if the original rectangle ended with a hyphen.
                 */
                do {
                    fragEnd += 1;

                    String fragment =
                            replacementText.substring(fragBegin, fragEnd);
                    if (r.hyphenated) {
                        fragment += "-";
                    }

                    filledWidth =
                            p.info_textline(fragment, "width", measureOptlist);
                }
                while (filledWidth <= r.width() && fragEnd < replacementLength);
            }

            String replacementFragment = 
                        replacementText.substring(fragBegin, fragEnd);
            if (r.hyphenated) {
                replacementFragment += "-";
            }

            if (replacementFragment.length() > 0) {
                p.save();

                /*
                 * The text must be positioned vertically at the same baseline
                 * as the original text.
                 *
                 * PDFlib calculates the scaling for the replacement text so it
                 * fits into the box (fitmethod=auto).
                 *
                 * The setcolor call is intended for highlighting the
                 * replacement text, delete this for getting the replacement
                 * text in the default color.
                 */
                p.setcolor("fillstroke", "rgb", 1, 0, 0, 0);

                String optlist = "font=" + font + " " + "boxsize={" + r.width()
                        + " " + r.fontsize + "} " + "position={left bottom} "
                        + "fitmethod=auto fontsize=" + r.fontsize + " "
                        + "shrinklimit=65%";
                p.fit_textline(replacementFragment, r.llx, r.baseline, optlist);
                p.restore();
            }

            if (verbose) {
                out.println("Replaced \"" + matchedText + "\" with \""
                        + replacementText + "\"");
            }

            replacementIndex = fragEnd;
        }
    }

    /*
     * Replace the given word if the whole word matches the search term
     * regular expression: count the replacement, determine the geometry of
     * the word's fragments, and let the fragments be overpainted and filled
     * with the replacement text. Words that do not match are left untouched.
     *
     * @param tet
     *            The TET object
     * @param doc 
     *            The TET document handle for the input document
     * @param font
     *            Font handle
     * @param p
     *            pdflib object
     * @param page
     *            Handle for the current page
     * @param pageno
     *            The current page number
     * @param word
     *            The current word that potentially will be replaced
     *
     * @throws TETException
     *             An error occurred in the TET API
     * @throws PDFlibException
     *             An error occurred in the PDFlib API
     */
    private void replace_text(final TET tet, final int doc, final int font, 
            final pdflib p, final int page,
            final int pageno, final String word) throws TETException, PDFlibException {
        final Matcher searchTerm = SEARCH_TERM_REGEX.matcher(word);

        /* Nothing to do unless the complete word is a search term. */
        if (!searchTerm.matches()) {
            return;
        }

        replacements += 1;

        final String matchedText = searchTerm.group(0);

        /*
         * The rectangles that belong to this instance of the search term
         */
        final List<rectangle> fragments =
                analyze_word_fragments(tet, doc, page, pageno, matchedText);

        replace_fragments(font, p, pageno, matchedText, fragments);
    }

    /*
     * Process a page: Create a new page in the output document, place the page
     * from the input document in the output document, and replace all
     * occurrences of the search term with its uppercase form.
     *
     * The output page begun by importPdiPage() is ended on every path, also
     * when the input page cannot be imported or the TET page cannot be
     * opened. Otherwise the next page could not be started. In both failure
     * cases an error is reported on System.err and no text is replaced on
     * this page.
     *
     * @param tet
     *            TET object
     * @param doc
     *            TET document handle
     * @param font
     *            Font for replacement text
     * @param p
     *            pdflib object
     * @param pdiHandle
     *            PDI document handle
     * @param pageno
     *            The current page number
     * @throws TETException
     *             An error occurred in the TET API
     * @throws PDFlibException
     *             An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, int font, pdflib p,
            int pdiHandle, int pageno) throws TETException, PDFlibException {
        /*
         * Copy page from input document to output document.
         */
        if (!importPdiPage(p, pdiHandle, pageno)) {
            /*
             * The error was already reported. The output page has been begun
             * nevertheless, so it must be ended; replacement text without the
             * underlying page contents would be meaningless.
             */
            p.end_page_ext("");
            return;
        }

        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());

            /* Keep the imported page, but without any replacements. */
            p.end_page_ext("");
            return;
        }

        /* Retrieve all text fragments for the page */
        for (String text = tet.get_text(page); text != null; text = tet
                .get_text(page)) {
            replace_text(tet, doc, font, p, page, pageno, text);
        }

        /* get_text() also returns null on error, so check the error number */
        if (tet.get_errnum() != 0) {
            System.err.println("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
        }

        /*
         * Close page in the input and output documents.
         */
        p.end_page_ext("");
        tet.close_page(page);
    }

    /*
     * Run the complete search-and-replace job: create the output document,
     * open the input document with both PDI and TET, process all pages, and
     * print the replacement statistics.
     *
     * Errors signalled by return values and exceptions thrown by the TET or
     * PDFlib API are reported on System.err. The TET and pdflib objects are
     * deleted in any case, provided they could be created at all.
     */
    private void execute() {
        TET tet = null;
        pdflib p = null;

        /* 0 means "not inside the page loop" for the error messages below */
        int pageno = 0;

        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            p = new pdflib();
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

            if (p.begin_document(outfilename, "") == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }

            /* add document info entries */
            p.set_info("Creator", "Search and Replace TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);
            p.set_info("Subject", "Replace text matched by regex \""
                    + SEARCH_TERM_REGEX.pattern()
                    + "\" with its uppercase form" );

            int pdiHandle = p.open_pdi_document(infilename, "");
            if (pdiHandle == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }

            /*
             * Load the font for the replacement text.
             */
            int font = p.load_font(REPLACEMENT_FONT, "unicode", "");
            if (font == -1) {
                System.err.println("Error loading font: " + p.get_errmsg());
                return;
            }

            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                System.err.println("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, font, p, pdiHandle, pageno);
            }

            /*
             * The page loop is finished; errors from here on must not be
             * attributed to the (non-existing) page n_pages + 1.
             */
            pageno = 0;

            out.println("Replaced " + replacements + " words, "
                    + fragmented + " words were fragmented");

            p.end_document("");
            p.close_pdi_document(pdiHandle);
            tet.close_document(doc);
        }
        catch (TETException e) {
            report_exception(String.valueOf(e.get_errnum()), e.get_apiname(),
                    e.get_errmsg(), pageno);
        }
        catch (PDFlibException e) {
            report_exception(String.valueOf(e.get_errnum()), e.get_apiname(),
                    e.get_errmsg(), pageno);
        }
        finally {
            /*
             * Either object is still null if its constructor threw, so guard
             * the cleanup to avoid masking the original error with a
             * NullPointerException.
             */
            if (tet != null) {
                tet.delete();
            }
            if (p != null) {
                p.delete();
            }
        }
    }

    /*
     * Print the details of a TET or PDFlib exception to System.err.
     *
     * @param errnum
     *            The error number as text
     * @param apiname
     *            The name of the API function that failed
     * @param errmsg
     *            The error message
     * @param pageno
     *            The page being processed, or 0 if the error is not related
     *            to a particular page
     */
    private static void report_exception(String errnum, String apiname,
            String errmsg, int pageno) {
        final String location =
                (pageno == 0) ? "(): " : "() on page " + pageno + ": ";

        System.err.println("Error " + errnum + " in " + apiname + location
                + errmsg + "\n");
    }

    /*
     * @param infilename
     *            the name of the file for which the file with replaced text
     *            will be generated
     * @param outfilename
     *            the name of the output file
     */
    private search_and_replace_text(String infilename, String outfilename) {
        this.infilename = infilename;
        this.outfilename = outfilename;

        this.coordFormat = NumberFormat.getInstance();
        coordFormat.setMinimumFractionDigits(0);
        coordFormat.setMaximumFractionDigits(2);
    }

    /*
     * Program entry point. Announces the output encoding, sets up the
     * encoded output stream, and runs the job when exactly two arguments
     * (input file name, output file name) are given; otherwise prints the
     * usage message.
     *
     * @param args
     *            the command line arguments
     * @throws UnsupportedEncodingException
     *             the encoding named by OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        final boolean argumentsValid = (args.length == 2);

        if (argumentsValid) {
            final String inputName = args[0];
            final String outputName = args[1];
            new search_and_replace_text(inputName, outputName).execute();
        }
        else {
            out.println("usage: search_and_replace_text <infilename> <outfilename>");
        }
    }
}