TET Cookbook

cookbook

tet_and_pdflib/create_table_of_contents

Use TET and PDFlib to create a table of contents (TOC) for the original document.

Download Java Code  Show Output  Show Input PDF 

package com.pdflib.cookbook.tet.tet_and_pdflib;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;

/**
 * Extract some text from a PDF based on certain typographic criteria (font,
 * fontsize) along with the corresponding page numbers, and use PDFlib to create
 * a table of contents (TOC) for the original document, possibly enriched with
 * active links to the respective pages. With PDFlib+PDI the TOC could be
 * prepended to the original pages. With plain PDFlib a stand-alone TOC can be
 * created.
 * <p>
 * Required software: TET 3 and PDFlib+PDI 8 or PDFlib 8
 * <p>
 * Required data: PDF document
 * 
 */
class create_table_of_contents {
    /**
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist " + DOC_SEARCH_PATH
            + "}";

    /**
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=page";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * The name of the input file
     */
    private String infilename;

    /**
     * The name of the output file
     */
    private String outfilename;

    /**
     * The name of the font to search for.
     */
    private static final String FONT_NAME = "TheSansBold-Plain";

    /**
     * The font size to search for in points.
     */
    private static final double FONT_SIZE = 9;

    /**
     * The tolerance for the font size in points.
     */
    private static final double FONT_SIZE_TOLERANCE = 0.01;

    /**
     * Nudge factor for ascender height of the Web links (relative to the font size)
     */
    private static final double ASCENDER = 0.85;

    /**
     * Whether to use PDI to create a new document that consists of the original
     * document and the TOC prepended to it. Set this to false in order not to use
     * PDI, and in order to produce a document that only contains the TOC.
     */
    private static final boolean USE_PDI = true;

    /**
     * The page width for the TOC pages (see "width" option for begin_page_ext() in
     * the PDFlib Reference Manual)
     */
    private static final String TOC_WIDTH = "a4.width";

    /**
     * The page height for the TOC pages (see "height" option for begin_page_ext()
     * PDFlib Reference Manual)
     */
    private static final String TOC_HEIGHT = "a4.height";

    /**
     * The title for the TOC.
     */
    private static final String TOC_TITLE = "Table of Contents";

    /**
     * The font to use for the headline that is placed on each page of the TOC.
     */
    private static final String TOC_TITLE_FONT = "Helvetica-Bold";

    /**
     * The fontsize to use for the headline that is placed on each page of the TOC.
     */
    private static final int TOC_TITLE_FONTSIZE = 18;

    /**
     * The font to use for the TOC entries.
     */
    private static final String TOC_FONT = "Helvetica";

    /**
     * The fontsize for the TOC entries.
     */
    private static final int TOC_FONTSIZE = 12;

    /**
     * x-position of the lower-left corner of the TOC fitbox.
     */
    private static final int TOC_LLX = 110;

    /**
     * y-position of the lower-left corner of the TOC fitbox.
     */
    private static final int TOC_LLY = 100;

    /**
     * x-position of the upper-right corner of the TOC fitbox.
     */
    private static final int TOC_URX = 450;

    /**
     * y-position of the upper-right corner of the TOC fitbox.
     */
    private static final int TOC_URY = 700;

    /**
     * Lower-left y-position for the TOC headline.
     */
    private static final int TOC_TITLE_LLY = 740;

    /**
     * The prefix for a destination name.
     */
    private static final String TOC_DESTINATION_PREFIX = "tmx";

    /**
     * The text flow (including options) for the TOC contents.
     */
    private StringBuffer tocTextflow = new StringBuffer();

    /**
     * The current destination number. Used to generate unique destination names.
     */
    private int destNumber = 0;

    /**
     * Import the current page from the PDI import document and place it in the
     * ouput document.
     *
     * @param p         the pdflib object
     * @param pdiHandle the PDI handle for the input document
     * @param pageno    the current page number
     *
     * @throws PDFlibException an error occurred in the PDFlib API
     */
    private int put_pdi_page(pdflib p, int pdiHandle, int pageno) throws PDFlibException {
        /*
         * The page size will be adjusted later to match the size of the input pages
         */
        p.begin_page_ext(10, 10, "group content");

        int pageHandle = p.open_pdi_page(pdiHandle, pageno, "");

        if (pageHandle != -1) {
            /* Place the input page and adjust the page size */
            p.fit_pdi_page(pageHandle, 0, 0, "adjustpage");
        }

        return pageHandle;
    }

    /**
     * Tests whether the current character matches the criteria for text that shall
     * get a an entry in the TOC. get_char_info must have been called before in
     * order to ensure that the TET object contains the information for the current
     * character.
     * 
     * @param tet The TET object
     * @param doc The TET document handle
     * @throws TETException
     */
    private boolean font_matches(TET tet, final int doc) throws TETException {
        String name = tet.pcos_get_string(doc, "fonts[" + tet.fontid + "]/name");
        return name.equals(FONT_NAME) && (Math.abs(tet.fontsize - FONT_SIZE) <= FONT_SIZE_TOLERANCE);
    }

    /**
     * Add text and options to the textflow for the current TOC entry. The options
     * take care that the TOC will be properly formattted (do not split TOC entries
     * over page boundaries).
     * 
     * @param p       The pdflib object
     * @param tocText The text of the TOC entry to add
     * @param pageno  The page number for the TOC entry
     * @param ulx     x-position of the identified text on the page
     * @param uly     y-position of the identified text on the page
     * 
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void add_toc_entry(pdflib p, String tocText, int pageno, double ulx, double uly) throws PDFlibException {
        /*
         * The same name is used for the matchbox name in the TOC and for the named
         * destination that is the target of the "GoTo" action in the TOC.
         */
        String destName = get_destination_name(destNumber);

        /*
         * We need pairwise marks that enclose the text that shall be kept together.
         */
        tocTextflow.append("<mark=").append(destNumber * 2).append(" alignment=left matchbox={name=").append(destName)
                .append(" boxheight={fontsize descender}}>").append(tocText).append("<leader={alignment={grid}}>\t")
                .append(pageno).append("<matchbox=end nextline mark=").append((destNumber * 2) + 1).append(">\n");

        if (USE_PDI) {
            p.add_nameddest(destName, "type=fixed left=" + ulx + " top=" + uly);
        }

        destNumber += 1;
    }

    /**
     * Create a unique name for each destination.
     * 
     * @param number The number of the destination.
     * 
     * @return A string that can be used as a destination name and as the
     *         corresponding matchbox name.
     */
    private String get_destination_name(int number) {
        return TOC_DESTINATION_PREFIX + number;
    }

    /**
     * Process a page: Create a new page in the output document, place the page from
     * the input document in the output document, and create TOC entries for all
     * occurrences of text with the desired properties.
     * 
     * @param tet       TET object
     * @param doc       TET document handle
     * @param p         pdflib object
     * @param pdiHandle PDI document handle
     * @param pageno    The current page number
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, pdflib p, int pdiHandle, int pageno)
            throws TETException, PDFlibException {
        if (USE_PDI) {
            put_pdi_page(p, pdiHandle, pageno);
        }

        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        } else {
            /* Retrieve all text fragments for the page */
            for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
                int nextCharPos = 0;
                int matchStart = -1;

                double uly = 0;
                double ulx = 0;

                while (tet.get_char_info(page) != -1) {
                    nextCharPos += 1;

                    if (font_matches(tet, doc)) {
                        if (matchStart == -1) {
                            // start of new matching chunk
                            matchStart = nextCharPos - 1;
                            uly = tet.y + ASCENDER * tet.fontsize;
                            ulx = tet.x;
                        }
                    } else {
                        if (matchStart != -1) {
                            /*
                             * End of matching chunk. The last character that is belonging to the chunk is
                             * the one preceding the next character. matchEnd is the index of the first
                             * character after the bookmark characters.
                             */
                            int matchEnd = nextCharPos - 1;

                            /*
                             * remove trailing whitespace
                             */
                            while (matchEnd > matchStart && Character.isWhitespace(text.charAt(matchEnd - 1))) {
                                matchEnd -= 1;
                            }

                            String tocText = text.substring(matchStart, matchEnd);
                            out.println("Creating TOC entry \"" + tocText + "\"");
                            add_toc_entry(p, tocText, pageno, ulx, uly);
                            matchStart = -1;
                        }
                    }
                }
            }

            if (tet.get_errnum() != 0) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /*
             * Close page in the input and output documents.
             */
            if (USE_PDI) {
                p.end_page_ext("");
            }
            tet.close_page(page);
        }
    }

    private void create_toc(pdflib p) throws Exception {
        final String optlist = "fontname=" + TOC_FONT + " fontsize=" + TOC_FONTSIZE + " encoding=unicode ruler=100%"
                + " hortabmethod=ruler tabalignment=right";

        /*
         * Load the font for the title of the TOC
         */
        int font = p.load_font(TOC_TITLE_FONT, "unicode", "");

        if (font == -1)
            throw new Exception("Error: " + p.get_errmsg());

        /*
         * Create a textflow for the collected TOC entries.
         */
        int tf = p.create_textflow(tocTextflow.toString(), optlist);
        if (tf == -1)
            throw new Exception("Error: " + p.get_errmsg());

        /*
         * Maximum number of the mark defined in the Textflow
         */
        int maxMark = (destNumber * 2) - 1;

        /*
         * Keep track of the first mark for each page for creating the "GoTo" actions on
         * the page.
         */
        int startMark = 0;

        /*
         * Loop until all of the text is placed; create new pages as long as more text
         * needs to be placed.
         */
        String result;
        do {
            p.begin_page_ext(0, 0, "group=toc width=" + TOC_WIDTH + " height=" + TOC_HEIGHT);

            /* Place a text line with a title */
            p.setfont(font, TOC_TITLE_FONTSIZE);
            p.fit_textline(TOC_TITLE, TOC_LLX, TOC_TITLE_LLY, "");

            /*
             * Place the Textflow with the table of contents in blind mode, to find out how
             * many marks did fit into the box. With this information we also prevent a TOC
             * entry from being split over two pages by using the "returnatmark" option.
             */
            result = p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, "blind");

            int lastMark = (int) p.info_textflow(tf, "lastmark");

            /*
             * An even mark number indicates the start of a text section to be kept
             * together. Reset it to the last odd mark number which indicates the end of a
             * text section.
             */
            if (lastMark % 2 == 0) {
                lastMark -= 1;
            }

            /*
             * Now actually fit the textflow. To rewind the textflow status to before the
             * last call to fit_textflow() use "rewind=-1".
             */
            result = p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, "returnatmark=" + lastMark + " rewind=-1");

            /*
             * When PDI is in use, create the "GoTo" actions that allows to click on a TOC
             * entry for jumping to the corresponding section of the document.
             */
            for (int i = startMark; USE_PDI && i <= lastMark / 2; i += 1) {
                String destinationName = get_destination_name(i);
                int action = p.create_action("GoTo", "destname=" + destinationName);

                p.create_annotation(0, 0, 0, 0, "Link",
                        "action={activate " + action + "} linewidth=0 usematchbox={" + destinationName + "}");
            }

            startMark = (lastMark / 2) + 1;

            p.end_page_ext("");

            /*
             * "_boxfull" means we must continue because there is more text; "_nextpage" is
             * interpreted as "start new column"
             */
        } while (!result.equals("_stop") && !result.equals("_boxempty") && !result.equals("_mark" + maxMark));

        /* Check for errors */
        if (result.equals("_boxempty"))
            throw new Exception("Error: Textflow box for TOC is too small");

        p.delete_textflow(tf);
    }

    private void execute() {
        TET tet = null;
        pdflib p = null;
        int pageno = 0;

        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            p = new pdflib();
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

            if (p.begin_document(outfilename, "groups={toc content}") == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }

            /* add document info entries */
            p.set_info("Creator", "Create Table Of Contents TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);

            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            int pdiHandle = -1;
            if (USE_PDI) {
                pdiHandle = p.open_pdi_document(infilename, "");
                if (pdiHandle == -1) {
                    System.err.println("Error: " + p.get_errmsg());
                    return;
                }
            }

            /*
             * Loop over pages in the document
             */
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, p, pdiHandle, pageno);
            }

            /*
             * Use the information collected while processing the input document to create
             * the table of contents.
             */
            create_toc(p);

            p.end_document("");
            if (USE_PDI) {
                p.close_pdi_document(pdiHandle);
            }
            tet.close_document(doc);

            out.println("Created PDF output document \"" + outfilename + "\" with " + destNumber + " TOC entries.");
        } catch (TETException e) {
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
        } catch (PDFlibException e) {
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            tet.delete();
            p.delete();
        }
    }

    /**
     * @param infilename  the name of the file for which the bookmarked file will be
     *                    generated
     * @param outfilename the name of the output file
     */
    private create_table_of_contents(String infilename, String outfilename) {
        this.infilename = infilename;
        this.outfilename = outfilename;
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 2) {
            out.println("usage: create_table_of_contents <infilename> <outfilename>");
            return;
        }

        create_table_of_contents t = new create_table_of_contents(args[0], args[1]);
        t.execute();
    }
}