TET Cookbook

cookbook

tet_and_pdflib/highlight_artifacts

Use TET and PDFlib to search for text and image Artifacts and make them visible with the "Highlight" annotation

Download Java Code  Show Output  Show Input (invoice_pdfua1.pdf) 

/*
 * Search text and image Artifacts: Identify all Artifacts (irrelevant content)
 * and make them visible with the "Highlight" annotation.
 * 
 * Required software: TET 5.2 and PDFlib+PDI 9
 * 
 * Required data: PDF document
 * 
 * Artifact highlighting is subject to some restrictions to simplify the code:
 * - If one or more glyphs in a word are Artifacts, the whole word is
 *   highlighted.
 * - Rotated text or images are highlighted in a simplified manner. 
 * - Artifact images which are merged into a larger image (consumed images)
 *   are not reported.
 */

package com.pdflib.cookbook.tet.tet_and_pdflib;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;

class highlight_artifacts {
    /*
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /*
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist " + DOC_SEARCH_PATH
            + "}";

    /*
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     *
     * "contentanalysis={keephyphenglyphs}" is specified because we want to
     * highlight hyphen Artifacts.
     *
     * NOTE(review): "nopunctuationbreaks" presumably keeps punctuation attached
     * to the adjacent word so that it ends up in the same text chunk - confirm
     * against the TET manual.
     */
    private static final String PAGE_OPTLIST = "granularity=word contentanalysis={keephyphenglyphs nopunctuationbreaks}";

    /*
     * Nudge value for the glyph reference point (in points). This avoids
     * problems where a glyph would be considered "outside" the annotation
     * because of rounding problems although its reference point sits exactly
     * on the annotation border.
     *
     * The nudge is applied to the left edge of each Artifact glyph rectangle
     * and once more to the lower left corner of the resulting annotation.
     */
    private static final double REFPOINT_NUDGE = 0.25;

    /*
     * Option list for the "Highlight" annotation placed over Artifact text.
     */
    private static final String TEXT_ANNOTATION_OPTLIST = "annotcolor=red linewidth 1 "
            + "title={TET/PDFlib Artifact Highlighting} " + "contents={Artifact text} ";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /*
     * The name of the input file
     */
    private final String infilename;

    /*
     * The name of the output file
     */
    private final String outfilename;

    /*
     * Helper class to store rectangle data. Instances are immutable and do not
     * need access to the enclosing object, hence the class is static.
     */
    private static class rectangle {
        rectangle(double llx, double lly, double urx, double ury) {
            this.llx = llx;
            this.lly = lly;
            this.urx = urx;
            this.ury = ury;
        }

        final double llx;
        final double lly;
        final double urx;
        final double ury;
    }

    /*
     * Import the current page from the PDI import document and place it in the
     * output document.
     *
     * A new output page is begun in any case; it is still open when this
     * method returns, regardless of the result. The caller is responsible for
     * ending it.
     *
     * @param p         the pdflib object
     * @param pdiHandle the PDI handle for the input document
     * @param pageno    the current page number
     *
     * @return true if the input page was placed on the output page, false if
     *         the input page could not be opened (the error message has been
     *         printed to System.err)
     *
     * @throws PDFlibException an error occurred in the PDFlib API
     */
    private boolean importPdiPage(pdflib p, int pdiHandle, int pageno) throws PDFlibException {
        /*
         * The page size will be adjusted later to match the size of the input pages
         */
        p.begin_page_ext(10, 10, "");
        int pdiPage = p.open_pdi_page(pdiHandle, pageno, "");

        if (pdiPage == -1) {
            System.err.println("Error: " + p.get_errmsg());
            return false;
        }

        /* Place the input page and adjust the page size */
        p.fit_pdi_page(pdiPage, 0, 0, "adjustpage");
        p.close_pdi_page(pdiPage);

        return true;
    }

    /*
     * Compute the smallest rectangle which encloses all given rectangles.
     *
     * The search starts with infinite values so that the result is correct
     * for arbitrary coordinates, including negative ones.
     *
     * @param rectangles the rectangles to enclose; must not be empty
     *
     * @return the enclosing rectangle
     */
    private static rectangle boundingBox(List<rectangle> rectangles) {
        double minx = Double.POSITIVE_INFINITY;
        double miny = Double.POSITIVE_INFINITY;
        double maxx = Double.NEGATIVE_INFINITY;
        double maxy = Double.NEGATIVE_INFINITY;

        for (rectangle r : rectangles) {
            minx = Math.min(minx, r.llx);
            miny = Math.min(miny, r.lly);
            maxx = Math.max(maxx, r.urx);
            maxy = Math.max(maxy, r.ury);
        }

        return new rectangle(minx, miny, maxx, maxy);
    }

    /*
     * Retrieve all text chunks of the page and place a "Highlight" annotation
     * on the current output page for each chunk which contains at least one
     * Artifact glyph. The annotation covers the bounding box of the Artifact
     * glyphs in the chunk.
     *
     * @param tet  TET object
     * @param doc  TET document handle
     * @param p    pdflib object
     * @param page TET page handle
     *
     * @return the number of text chunks containing Artifact glyphs
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private int highlightTextArtifacts(TET tet, int doc, pdflib p, int page)
            throws TETException, PDFlibException {
        int chunks = 0;

        for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
            /*
             * List for collecting the rectangles that belong to an Artifact
             */
            List<rectangle> rectangles = new LinkedList<rectangle>();

            /* Loop over all glyphs in the text chunk and check for Artifacts */
            while (tet.get_char_info(page) != -1) {
                if ((tet.attributes & TET.ATTR_ARTIFACT) == 0) {
                    continue;
                }

                /*
                 * Get ascender and descender, which are expressed relative to a font scaling
                 * factor of 1000. Descender will be returned as a negative number, therefore it
                 * will be added to the baseline y position to get the lower left y value.
                 * The font metrics are only queried for Artifact glyphs.
                 */
                final String fontPath = "fonts[" + tet.fontid + "]";
                final double descender = tet.pcos_get_number(doc, fontPath + "/descender") / 1000;
                final double ascender = tet.pcos_get_number(doc, fontPath + "/ascender") / 1000;

                /* Slightly expand the rectangle to the left to avoid rounding problems. */
                rectangles.add(new rectangle(tet.x - REFPOINT_NUDGE, tet.y + descender * tet.fontsize,
                        tet.x + tet.width, tet.y + ascender * tet.fontsize));
            }

            if (rectangles.isEmpty()) {
                continue;
            }

            /* Count only full words, not individual glyphs */
            chunks++;

            /* Slightly expand the annotation to avoid rounding problems. */
            final rectangle box = boundingBox(rectangles);
            p.create_annotation(box.llx - REFPOINT_NUDGE, box.lly - REFPOINT_NUDGE, box.urx, box.ury, "Highlight",
                    TEXT_ANNOTATION_OPTLIST);
        }

        return chunks;
    }

    /*
     * Retrieve all images on the page and place a "Highlight" annotation on
     * the current output page for each image which is marked as Artifact.
     *
     * @param tet  TET object
     * @param p    pdflib object
     * @param page TET page handle
     *
     * @return the number of Artifact images
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private int highlightImageArtifacts(TET tet, pdflib p, int page) throws TETException, PDFlibException {
        int images = 0;

        while (tet.get_image_info(page) == 1) {
            if ((tet.attributes & TET.ATTR_ARTIFACT) != 0) {
                images++;

                final String optlist = "annotcolor=red linewidth=1 " + "title={TET/PDFlib Artifact Highlighting} "
                        + "contents={Artifact image (id " + tet.imageid + ")} ";
                p.create_annotation(tet.x, tet.y, tet.x + tet.width, tet.y + tet.height, "Highlight", optlist);
            }
        }

        return images;
    }

    /*
     * Process a page: Create a new page in the output document, place the page from
     * the input document in the output document, and highlight all text and image
     * Artifacts on the page. A summary line with the number of Artifacts found is
     * printed to System.err.
     *
     * If the page cannot be imported with PDI or cannot be opened with TET, the
     * output page is ended without annotations and the method returns. The
     * output page is always ended before returning normally, so that the next
     * page can be started.
     *
     * @param tet       TET object
     * @param doc       TET document handle
     * @param p         pdflib object
     * @param pdiHandle PDI document handle
     * @param pageno    The current page number
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, pdflib p, int pdiHandle, int pageno)
            throws TETException, PDFlibException {
        /*
         * Copy page from input document to output document. Without the
         * imported page contents there is nothing to annotate.
         */
        if (!importPdiPage(p, pdiHandle, pageno)) {
            p.end_page_ext("");
            return;
        }

        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            /* The output page was already started and must not be left open */
            p.end_page_ext("");
            return;
        }

        final int text_artifacts = highlightTextArtifacts(tet, doc, p, page);
        System.err.print("Page " + pageno + ": found " + text_artifacts + " text artifact chunk(s)");

        final int image_artifacts = highlightImageArtifacts(tet, p, page);
        if (image_artifacts > 0) {
            System.err.println(" and " + image_artifacts + " image artifact(s)");
        } else {
            System.err.println();
        }

        /* The retrieval loops above end on errors as well; report any pending error */
        if (tet.get_errnum() != 0) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        }

        /*
         * Close page in the input and output documents.
         */
        p.end_page_ext("");
        tet.close_page(page);
    }

    /*
     * Create the output document, open the input document with both PDI and
     * TET, and process all pages. Errors are reported on System.err; the TET
     * and pdflib objects are deleted in any case.
     */
    private void execute() {
        TET tet = null;
        pdflib p = null;
        int pageno = 0;

        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            p = new pdflib();
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

            if (p.begin_document(outfilename, "") == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }

            /* add document info entries */
            p.set_info("Creator", "Highlight Artifacts TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);

            int pdiHandle = p.open_pdi_document(infilename, "");
            if (pdiHandle == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }

            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, p, pdiHandle, pageno);
            }

            p.end_document("");
            p.close_pdi_document(pdiHandle);
            tet.close_document(doc);
        } catch (TETException e) {
            /* pageno is still 0 if the error occurred before the page loop */
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
        } catch (PDFlibException e) {
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            /*
             * Either object may still be null if its constructor failed;
             * deleting unconditionally would hide the original error behind a
             * NullPointerException.
             */
            if (tet != null) {
                tet.delete();
            }
            if (p != null) {
                p.delete();
            }
        }
    }

    /*
     * @param infilename  the name of the file for which the file with highlighted
     *                    Artifacts will be generated
     * @param outfilename the name of the output file
     */
    private highlight_artifacts(String infilename, String outfilename) {
        this.infilename = infilename;
        this.outfilename = outfilename;
    }

    /*
     * Program entry point. Expects exactly two arguments: the name of the
     * input document and the name of the output document. Prints a usage
     * message otherwise.
     *
     * @param args command line arguments
     *
     * @throws UnsupportedEncodingException OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 2) {
            out.println("usage: highlight_artifacts <infilename> <outfilename>");
            return;
        }

        highlight_artifacts t = new highlight_artifacts(args[0], args[1]);
        t.execute();
    }
}