TET Cookbook

cookbook

text/text_from_annotations

Extract text from annotations with PDFlib TET and the pCOS interface.

Download Java Code  Show Output  Show Input (FontReporter.pdf) 

/*
 * Extract text from annotations with PDFlib TET and the pCOS interface
 * 
 * The topic "formfields" in the pCOS Cookbook demonstrates how to read
 * the values of form fields.
 * 
 * Required software: TET 5
 * 
 * Required data: PDF document
 * 
 */

package com.pdflib.cookbook.tet.text;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

import com.pdflib.TETException;
import com.pdflib.TET;

/**
 * Command-line sample that prints the text stored in the annotations of a PDF
 * document. It walks over all pages with pCOS paths, skips form fields
 * (Widget annotations) and annotations without non-empty string /Contents,
 * and prints for each remaining annotation its page number, subtype,
 * rectangle and contents.
 */
public class text_from_annotations {
    /*
     * Global option list
     */
    static final String GLOBAL_OPTLIST = "searchpath={../input}";

    /*
     * Document-specific option list
     */
    static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list. Not referenced by this sample, since all
     * information is retrieved via pCOS without opening a page.
     */
    static final String PAGE_OPTLIST = "";

    /*
     * Separator to use as paragraph break. Text in PDF annotations uses
     * U+000D as paragraph separator which is impractical in many environments,
     * therefore we replace it.
     */
    static final String SEPARATOR = "\n";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * All report output goes through this stream so that every part of a line
     * is encoded consistently.
     */
    private static PrintStream out;

    /**
     * Entry point: expects exactly one argument, the name of the PDF document.
     * On any error a message is written to System.err and the process exits
     * with status 1 after the TET object has been released.
     *
     * @param argv command-line arguments; argv[0] is the PDF file name
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not a
     *         supported charset name
     */
    public static void main(String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        TET tet = null;

        /*
         * Remember the exit status instead of calling System.exit() inside the
         * catch blocks: System.exit() does not return, so the finally block
         * would never get a chance to delete the TET object.
         */
        int exitCode = 0;

        try {
            if (argv.length != 1) {
                throw new Exception("usage: text_from_annotations <filename>");
            }

            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(argv[0], DOC_OPTLIST);
            if (doc == -1) {
                throw new Exception(
                        "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            out.print("File name: " + tet.pcos_get_string(doc, "filename") + "\n");

            /* The number format never changes, so build it once up front. */
            DecimalFormat rectFormat = createRectFormat();

            int pagecount = (int) tet.pcos_get_number(doc, "length:pages");

            /* Loop over all pages */
            for (int page = 0; page < pagecount; page++) {
                String basePath = "pages[" + page + "]/annots";

                /* Get number of annotations on this page */
                int anncount = (int) tet.pcos_get_number(doc, "length:" + basePath);

                for (int ann = 0; ann < anncount; ann++) {
                    /* pCOS path for the next annotation: "pages[n]/annots[m]" */
                    String annotationPath = basePath + "[" + ann + "]";

                    String subtype = tet.pcos_get_string(doc, annotationPath + "/Subtype");

                    /* Ignore form fields (=Widgets) */
                    if (subtype.equals("Widget")) {
                        continue;
                    }

                    /* Ignore annotations without any /Contents entry or empty Contents */
                    String contentsPath = annotationPath + "/Contents";
                    if (!tet.pcos_get_string(doc, "type:" + contentsPath).equals("string")) {
                        continue;
                    }

                    String contents = tet.pcos_get_string(doc, contentsPath);
                    if (contents.length() == 0) {
                        continue;
                    }

                    /*
                     * Print the type of the annotation. You can use it to filter
                     * out unwanted annotation types. For example, annotation
                     * type "FreeText" is the only type which places text
                     * directly on the page.
                     */
                    out.print("page " + (page + 1) + ", annotation type: " + subtype + ", ");

                    printRectangle(tet, doc, annotationPath, rectFormat);

                    /*
                     * Print contents of the annotation, i.e. the actual text.
                     * Text in PDF annotations uses U+000D as paragraph separator
                     * which is impractical in many environments, therefore we
                     * replace it.
                     */
                    contents = contents.replace("\r", SEPARATOR);
                    out.print("contents: '" + contents + "'\n");
                }
            }

            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("TET exception occurred in text_from_annotations sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
            exitCode = 1;
        } catch (Exception e) {
            System.err.println(e);
            exitCode = 1;
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }

        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }

    /*
     * Create the number format used for rectangle coordinates: at most two
     * fraction digits, formatted with US symbols so that the decimal
     * separator does not depend on the default locale.
     */
    private static DecimalFormat createRectFormat() {
        DecimalFormat format = new DecimalFormat();
        format.setMinimumFractionDigits(0);
        format.setMaximumFractionDigits(2);
        format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
        return format;
    }

    /*
     * Print "rectangle: [a b c d]" for the annotation at the given pCOS path,
     * or "rectangle: (not available)" if its /Rect entry is not an array with
     * exactly four elements. The line is terminated with a newline.
     */
    private static void printRectangle(TET tet, int doc, String annotationPath, DecimalFormat format)
            throws TETException {
        out.print("rectangle: ");

        String rectPath = annotationPath + "/Rect";
        boolean hasFourNumbers = tet.pcos_get_string(doc, "type:" + rectPath).equals("array")
                && (int) tet.pcos_get_number(doc, "length:" + rectPath) == 4;

        if (!hasFourNumbers) {
            out.print("(not available)\n");
            return;
        }

        out.print("[");
        for (int i = 0; i < 4; i++) {
            if (i > 0) {
                out.print(" ");
            }
            out.print(format.format(tet.pcos_get_number(doc, rectPath + "[" + i + "]")));
        }
        out.print("]\n");
    }
}