PDFlib

highlight_fonts

Font highlighting: Search for all fonts that are not ignored (option "-ignorefonts") or that are explicitly identified (option "-includefonts"), and make them visible with the "Highlight" annotation.

Download Java Code     Show Output     Show Input PDF

package com.pdflib.cookbook.tet.tet_and_pdflib;


import java.io.PrintStream;

import java.io.UnsupportedEncodingException;

import java.util.Collection;

import java.util.Iterator;

import java.util.LinkedList;

import java.util.List;

import java.util.Set;

import java.util.StringTokenizer;

import java.util.TreeSet;


import com.pdflib.PDFlibException;

import com.pdflib.TET;

import com.pdflib.TETException;

import com.pdflib.pdflib;


/**

 * Font highlighting: Search for all fonts that are not ignored (option

 * "-ignorefonts") or that are explicitly identified (option "-includefonts"),

 * and make them visible with the "Highlight" annotation.

 * <p>

 * Required software: TET 3 and PDFlib+PDI 8

 * <p>

 * Required data: PDF document

 *

 * @version $Id: highlight_fonts.java,v 1.7 2015/12/03 13:53:17 stm Exp $

 */

class highlight_fonts {

    /**

     * Common search path for PDI and TET to find the input document.

     */

    private static final String DOC_SEARCH_PATH = "../input";


    /**

     * Global option list. The program expects the "resource" directory parallel

     * to the "java" directory.

     */

    private static final String GLOBAL_OPTLIST =

        "searchpath={../resource/cmap ../resource/glyphlist "

            + DOC_SEARCH_PATH + "}";


    /**

     * Document specific option list.

     */

    private static final String DOC_OPTLIST = "";


    /**

     * Page-specific option list.

     */

    private static final String PAGE_OPTLIST = "granularity=page";


    /**

     * Command line flag for fonts to ignore.

     */

    private static final String IGNORE_OPT = "-ignorefonts";


    /**

     * Command line flag for fonts to include.

     */

    private final static String INCLUDE_OPT = "-includefonts";


    /**

     * The encoding in which the output is sent to System.out. For running the

     * example in a Windows command window, you can set this for example to

     * "windows-1252" for getting Latin-1 output.

     */

    private static final String OUTPUT_ENCODING = System

            .getProperty("file.encoding");


    /**

     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.

     */

    private static PrintStream out;


    /**

     * The name of the input file

     */

    private String infilename;


    /**

     * The name of the output file

     */

    private String outfilename;


    /**

     * The list of fonts that are either included or ignored, depending on the

     * value of member "ignore".

     */

    private Set<String> fonts;


    /**

     * If ignore is true, only the fonts not present in the font list are

     * highlighted. If ignore is false, only the fonts in the fonts list are

     * highlighted.

     */

    private boolean ignore;


    /**

     * Nudge factor for ascender height of the annotations (relative to the font

     * size)

     */

    private static final double ASCENDER = 0.85;


    /**

     * Nudge factor for descender height of annotations (relative to the font

     * size)

     */

    private static final double DESCENDER = 0.25;


    /**

     * Import the current page from the PDI import document and place it in the

     * ouput document.

     *

     * @param p

     *            the pdflib object

     * @param pdiHandle

     *            the PDI handle for the input document

     * @param pageno

     *            the current page number

     *

     * @throws PDFlibException

     *             an error occurred in the PDFlib API

     */

    private boolean importPdiPage(pdflib p, int pdiHandle, int pageno)

            throws PDFlibException {

        /*

         * The page size will be adjusted later to match the size of the input

         * pages

         */

        p.begin_page_ext(10, 10, "");

        int pdiPage = p.open_pdi_page(pdiHandle, pageno, "");


        if (pdiPage == -1) {

            System.err.println("Error: " + p.get_errmsg());

            return false;

        }


        /* Place the input page and adjust the page size */

        p.fit_pdi_page(pdiPage, 0, 0, "adjustpage");

        p.close_pdi_page(pdiPage);


        return true;

    }


    /**

     * Whether to include the font in the output.

     *

     * @param tet

     *            The TET object

     * @param doc

     *            The TET document handle

     * @param pcosId

     *            The pCOS id of the font to check

     *

     * @return true if the font has to be included in the output, otherwise

     *         false

     * @throws TETException

     *             An error occurred in the TET API

     */

    private boolean includeFontInOutput(TET tet, int doc, int pcosId)

            throws TETException {

        String fontName = getFontName(tet, doc, pcosId);

        return ignore != fonts.contains(fontName);

    }


    /**

     * Get the font name for the pCOS id of a font

     *

     * @param tet

     *            The TET object

     * @param doc

     *            The TET document handle

     * @param pcosId

     *            The pCOS id of the font to check

     * @return The name of the font

     * @throws TETException

     *             An error occurred in the TET API

     */

    private String getFontName(TET tet, int doc, int pcosId)

            throws TETException {

        String fontName = tet.pcos_get_string(doc, "fonts["

                + pcosId + "]/name");

        return fontName;

    }


    /**

     * Helper class to store rectangle data.

     */

    private class rectangle {

        rectangle(double llx, double lly, double urx, double ury) {

            this.llx = llx;

            this.lly = lly;

            this.urx = urx;

            this.ury = ury;

        }


        double llx;

        double lly;

        double urx;

        double ury;

    }


    /**

     * Create annotations for a given list of rectangles.

     *

     * @param tet

     *            The TET object

     * @param doc

     *            The TET handle

     * @param p

     *            The pdflib object

     * @param rectangles

     *            The list of rectangles

     * @throws TETException

     *             An error occurred in the TET API

     * @throws PDFlibException

     *             An error occurred in the PDFlib API

     */

    private void create_annotations(TET tet, final int doc, pdflib p,

            List<rectangle> rectangles, int fontId) throws TETException, PDFlibException {


        StringBuffer optlist = new StringBuffer(

                "annotcolor {rgb 0.68 0.85 0.90} linewidth 1 ")

                .append("title {TET/PDFlib Font Highlighting} ")

                .append("contents {Font: ")

                .append(getFontName(tet, doc, fontId))

                .append("} polylinelist {");


        /*

         * Build the option list for the highlight annotation,

         * including the "polylinelist" option that describes one or

         * multiple rectangles for the highlighting annotation for

         * the potentially hyphenated word.

         *

         * We still need the rectangle that surrounds the separate

         * sub-rectangles of the annotation, for passing it to the

         * function create_annotation(). To get the actual values,

         * we start with impossible values and compute the minimum

         * and maximum accross the relevant values.

         */

        double minx = 1E10, miny = 1E10, maxx = -1, maxy = -1;


        Iterator<rectangle> i = rectangles.iterator();

        while (i.hasNext()) {

            /*

             * The quadrilaterals have to be built in the following

             * order: upper left corner -> upper right corner -> lower

             * left corner -> lower right corner

             */

            rectangle r = (rectangle) i.next();


            minx = Math.min(minx, r.llx);

            miny = Math.min(miny, r.lly);

            maxx = Math.max(maxx, r.urx);

            maxy = Math.max(maxy, r.ury);


            optlist.append("{");


            // upper left corner

            optlist.append(r.llx).append(" ").append(r.ury);

            // upper right corner

            optlist.append(" ").append(r.urx).append(" ").append(r.ury);

            // lower left corner

            optlist.append(" ").append(r.llx).append(" ").append(r.lly);

            // lower right corner

            optlist.append(" ").append(r.urx).append(" ").append(r.lly);


            optlist.append("} ");

        }

        optlist.append("}");


        p.create_annotation(minx, miny, maxx, maxy, "Highlight",

                optlist.toString());

    }


    /**

     * Process a page: Create a new page in the output document, place the page

     * from the input document in the output document, and highlight the

     * relevant text.

     *

     * @param tet

     *            TET object

     * @param doc

     *            TET document handle

     * @param p

     *            pdflib object

     * @param pdiHandle

     *            PDI document handle

     * @param pageno

     *            The current page number

     * @throws TETException

     *             An error occurred in the TET API

     * @throws PDFlibException

     *             An error occurred in the PDFlib API

     */

    private void process_page(TET tet, final int doc, pdflib p, int pdiHandle,

            int pageno) throws TETException, PDFlibException {

        /*

         * Copy page from input document to output document.

         */

        importPdiPage(p, pdiHandle, pageno);


        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);


        if (page == -1) {

            System.err.println("Error " + tet.get_errnum() + " in "

                    + tet.get_apiname() + "(): " + tet.get_errmsg());

        }

        else {

            /* Retrieve all text fragments for the page */

            for (String text = tet.get_text(page); text != null; text = tet

                    .get_text(page)) {

                /*

                 * List for collecting the rectangles that belong to an instance

                 * of the search term

                 */

                List<rectangle> rectangles = new LinkedList<rectangle>();


                double llx = 0, lly = 0, urx = 0, ury = 0, lasty = 0;

                int fontId = -1;


                /*

                 * Loop over all characters, watch the y position for a jump

                 * and the font id for a change to detect word fragments that

                 * have the same font. Recangles from multiple lines that have

                 * the same font belong to a common annotation.

                 */

                boolean inHighlightSequence = false;

                while (tet.get_char_info(page) != -1) {

                    boolean jumped = lasty != tet.y;

                    boolean fontChange = fontId != tet.fontid;


                    if (jumped || fontChange) {

                        if (inHighlightSequence) {

                            /*

                             * y value jumped or font changed, we have to start

                             * a new rectangle

                             */

                            rectangles.add(new rectangle(llx, lly, urx, ury));


                            /*

                             * If the font changed, the current annotation is

                             * complete.

                             */

                            if (fontChange) {

                                create_annotations(tet, doc, p, rectangles,

                                        fontId);

                                rectangles = new LinkedList<rectangle>();

                            }

                        }

                        inHighlightSequence =

                            includeFontInOutput(tet, doc, tet.fontid);


                        llx = tet.x;

                        lasty = tet.y;

                        lly = tet.y - DESCENDER * tet.fontsize;

                    }

                    fontId = tet.fontid;

                    urx = tet.x + tet.width;

                    ury = tet.y + ASCENDER * tet.fontsize;

                }


                /*

                 * Add the last identified rectangle.

                 */

                if (inHighlightSequence) {

                    rectangles.add(new rectangle(llx, lly, urx, ury));

                    create_annotations(tet, doc, p, rectangles, fontId);

                }

            }


            if (tet.get_errnum() != 0) {

                System.err.println("Error " + tet.get_errnum() + " in "

                        + tet.get_apiname() + "(): " + tet.get_errmsg());

            }


            /*

             * Close page in the input and output documents.

             */

            p.end_page_ext("");

            tet.close_page(page);

        }

    }


    /**

     * Join element of a collection into a string, delimeted by delimiter

     *

     * @param c

     *            Collection of items to join

     * @param delimiter

     *            Delimiter to put between the items

     * @return The joined string

     */

    public static String join(Collection<String> c, String delimiter) {

        StringBuffer buffer = new StringBuffer();

        Iterator<String> iter = c.iterator();

        while (iter.hasNext()) {

            buffer.append(iter.next());

            if (iter.hasNext()) {

                buffer.append(delimiter);

            }

        }

        return buffer.toString();

    }


    private void execute() {

        TET tet = null;

        pdflib p = null;

        int pageno = 0;


        try {

            tet = new TET();

            tet.set_option(GLOBAL_OPTLIST);


            p = new pdflib();

            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");


            if (p.begin_document(outfilename, "") == -1) {

                System.err.println("Error: " + p.get_errmsg());

                return;

            }


            /* add document info entries */

            p.set_info("Creator", "Highlight Fonts TET Cookbook Example");

            p.set_info("Author", "PDFlib GmbH");

            p.set_info("Title", infilename);

            String subjectFonts = join(fonts, ", ");

            String subject = (ignore ? "Ignored Fonts: " : "Included Fonts: ")

                    + subjectFonts;

            p.set_info("Subject", subject.toString());


            int pdiHandle = p.open_pdi_document(infilename, "");

            if (pdiHandle == -1) {

                System.err.println("Error: " + p.get_errmsg());

                return;

            }


            final int doc = tet.open_document(infilename, DOC_OPTLIST);

            if (doc == -1) {

                System.err.println("Error " + tet.get_errnum() + " in "

                        + tet.get_apiname() + "(): " + tet.get_errmsg());

                return;

            }


            /*

             * Loop over pages in the document

             */

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            for (pageno = 1; pageno <= n_pages; ++pageno) {

                process_page(tet, doc, p, pdiHandle, pageno);

            }


            p.end_document("");

            p.close_pdi_document(pdiHandle);

            tet.close_document(doc);


            if (ignore) {

                out.println("Created PDF output document \"" + outfilename

                        + "\" with all fonts highlighted except: "

                        + subjectFonts);

            }

            else {

                out.println("Created PDF output document \"" + outfilename

                        + "\" with the following fonts highlighted: "

                        + subjectFonts);

            }

        }

        catch (TETException e) {

            if (pageno == 0) {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "(): " + e.get_errmsg() + "\n");

            }

            else {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "() on page " + pageno + ": "

                        + e.get_errmsg() + "\n");

            }

        }

        catch (PDFlibException e) {

            if (pageno == 0) {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "(): " + e.get_errmsg() + "\n");

            }

            else {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "() on page " + pageno + ": "

                        + e.get_errmsg() + "\n");

            }

        }

        finally {

            tet.delete();

            p.delete();

        }

    }


    /**

     * @param fonts

     *            The list of fonts to be either included or ignored

     * @param ignore

     *            If ignore is true, only the fonts not present in the font list

     *            are highlighted. If ignore is false, only the fonts in the

     *            fonts list are highlighted.

     * @param infilename

     *            The name of the file for which the file with highlighted text

     *            will be generated

     * @param outfilename

     *            The name of the output file

     */

    private highlight_fonts(Set<String> fonts, boolean ignore, String infilename,

            String outfilename) {

        this.infilename = infilename;

        this.outfilename = outfilename;

        this.fonts = fonts;

        this.ignore = ignore;

    }


    /**

     * Splits the list of font names and generates a Set of font names from

     * them.

     *

     * @param fontList

     *            A comma-separated list of font names.

     *

     * @return A Set containing the elements of the font list

     */

    private static Set<String> parse_font_list(String fontList) {

        Set<String> retval = new TreeSet<String>();


        StringTokenizer tokenizer = new StringTokenizer(fontList, ",");


        while (tokenizer.hasMoreTokens()) {

            retval.add(tokenizer.nextToken());

        }


        return retval;

    }


    public static void main(String[] args) throws UnsupportedEncodingException {

        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");

        out = new PrintStream(System.out, true, OUTPUT_ENCODING);


        if (args.length != 4

                || !(args[0].equals(IGNORE_OPT)

                        || args[0].equals(INCLUDE_OPT))) {

            usage();

        }


        Set<String> fonts = parse_font_list(args[1]);


        highlight_fonts t = new highlight_fonts(fonts, args[0].equals(IGNORE_OPT),

                    args[2], args[3]);

        t.execute();

    }


    private static void usage() {

        System.err.println("usage: highlight_fonts [ -ignorefonts <font list> | "

            + " -includefonts <font list> ] <input document> <output document>");

        System.exit(1);

    }

}