special/extract_highlighted_text

Extract text under Highlight annotations.
Download Java Code Show Output Show Input (TET-datasheet.pdf)
/*
 * Extract highlighted text: Identify all "Highlight" annotations and extract
 * exactly the text under the annotation (i.e. all highlighted text).
 * The name of the user who created the annotation is also emitted.
 * 
 * You can interactively copy the highlighted text in Acrobat DC by clicking
 * on the Highlight annotation and selecting "Copy Text" in the context menu.
 * 
 * Required software: TET 5
 * 
 * Required data: PDF document with Highlight annotations
 * 
 * Restrictions:
 * - We use the annotation rectangle (/Rect) to identify the text area and
 *   ignore the /QuadPoints entry, which may describe multiple rectangles in
 *   a single annotation for multi-line text. Because of this simplification
 *   too much text may be extracted if the highlighted lines have different
 *   lengths.
 * - Acrobat creates highlights which are slightly larger than the highlighted
 *   text. As a result too much text can be extracted if the adjacent glyph
 *   sits close to the annotation. We avoid this by slightly reducing the
 *   annotation width.
 */

package com.pdflib.cookbook.tet.special;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import com.pdflib.TET;
import com.pdflib.TETException;

/*
 * Command-line sample which lists the text under each Highlight annotation
 * of a PDF document, page by page, together with the annotation's title
 * (the name of the user who created it).
 */
public class extract_highlighted_text
{
    /*
     * Global option list
     */
    static final String globaloptlist =
        "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /*
     * Document-specific option list
     */
    static final String docoptlist = "";

    /*
     * Page-specific option list
     */
    static final String pageoptlist = "granularity=page";

    /*
     * Separator to emit after each chunk of text. This depends on the
     * application's needs; for granularity=word a space character may be
     * useful.
     */
    static final String SEPARATOR = "\n";

    /*
     * Nudge value for the annotation width (in points). This avoids
     * problems where a glyph would incorrectly be considered as highlighted
     * because its reference point sits very close to the annotation border.
     * Acrobat usually expands the rectangle beyond the actual highlighted text.
     */
    private static final double REFPOINT_NUDGE = 2;

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /*
     * Program entry point: opens the document named on the command line and
     * reports the highlighted text of every page.
     *
     * @param argv Command-line arguments; exactly one PDF file name is expected
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not supported
     */
    public static void main (String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);
        TET tet = null;

        try
        {
            if (argv.length != 1)
            {
                throw new Exception(
                    "usage: extract_highlighted_text <filename>");
            }

            tet = new TET();

            tet.set_option(globaloptlist);

            int doc = tet.open_document(argv[0], docoptlist);

            if (doc == -1)
            {
                throw new Exception("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /* Get number of pages in the document */
            int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                process_page(tet, doc, pageno);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            System.err.println("TET exception occurred in extractor sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() +
                            ": " + e.get_errmsg());
        }
        catch (Exception e)
        {
            System.err.println(e);
        }
        finally
        {
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /*
     * Report the text under all Highlight annotations of a single page.
     *
     * pCOS is used to identify the Highlight annotations. For each of them
     * the rectangle coordinates are retrieved and used as includebox for
     * extracting the text.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param pageno The 1-based page number
     * @throws TETException if a TET call fails
     */
    private static void process_page(TET tet, int doc, int pageno)
        throws TETException
    {
        /* pCOS page indices are 0-based */
        String page_path = "pages[" + (pageno - 1) + "]";

        /* number of annotations on the page */
        int anncount =
            (int) tet.pcos_get_number(doc, "length:" + page_path + "/annots");

        /* number of highlight annotations */
        int highlight_count = 0;

        for (int i = 0; i < anncount; i++)
        {
            String annotation_path = page_path + "/annots[" + i + "]";

            if (!is_highlight(tet, doc, annotation_path))
            {
                continue;
            }

            String rect_path = annotation_path + "/Rect";

            if (!has_valid_rect(tet, doc, rect_path))
            {
                out.print("Invalid or missing /Rect entry in Highlight annotation " + (highlight_count+1) + " (annotation ignored)\n");
                continue;   /* continue with next annotation */
            }

            highlight_count++;

            String rect = build_includebox_rect(tet, doc, rect_path);
            String title = get_annotation_title(tet, doc, annotation_path);

            /*
             * Use the "includebox" option with the Highlight rectangle
             * coordinates to limit text extraction to this area on the page.
             * We open the page repeatedly for each annotation so that we
             * can distinguish the contents of each Highlight annotation.
             */
            int page = tet.open_page(doc, pageno,
                    pageoptlist + " includebox={" + rect + "}");

            if (page == -1)
            {
                /*
                 * There is no valid page handle here, so close_page() must
                 * not be called: it would fail on the invalid handle and
                 * abort the whole run instead of skipping this page.
                 */
                print_tet_error(tet, pageno);
                break;  /* page is damaged; continue with next page */
            }

            print_highlighted_text(tet, page, pageno, highlight_count, title);

            tet.close_page(page);
        }

        if (highlight_count == 0)
        {
            out.print("+++++ No Highlight annotations on page " + pageno + "\n");
        }
    }

    /*
     * Check whether an annotation is a Highlight annotation.
     *
     * The type of the /Subtype entry is checked first so that an annotation
     * without a usable /Subtype is skipped instead of raising an exception.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param annotation_path pCOS path of the annotation
     * @return true if /Subtype is the name "Highlight"
     * @throws TETException if a TET call fails
     */
    private static boolean is_highlight(TET tet, int doc,
        String annotation_path) throws TETException
    {
        String subtype_path = annotation_path + "/Subtype";

        return tet.pcos_get_string(doc, "type:" + subtype_path).equals("name")
            && tet.pcos_get_string(doc, subtype_path).equals("Highlight");
    }

    /*
     * Check whether the /Rect entry is an array with exactly four elements.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param rect_path pCOS path of the annotation's /Rect entry
     * @return true if the entry can be used as a rectangle
     * @throws TETException if a TET call fails
     */
    private static boolean has_valid_rect(TET tet, int doc, String rect_path)
        throws TETException
    {
        return tet.pcos_get_string(doc, "type:" + rect_path).equals("array")
            && (int) tet.pcos_get_number(doc, "length:" + rect_path) == 4;
    }

    /*
     * Build the rectangle for the "includebox" option from the annotation's
     * /Rect entry.
     *
     * The two corners are sorted first because a PDF rectangle may be given
     * by any pair of diagonally opposite corners. The right edge is then
     * moved to the left by REFPOINT_NUDGE (see the comment for that constant),
     * unless the rectangle is not wider than the nudge value, in which case
     * it is left unchanged so that it does not become inverted.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param rect_path pCOS path of a /Rect array with four entries
     * @return The rectangle in option list syntax "{llx lly urx ury }"
     * @throws TETException if a TET call fails
     */
    private static String build_includebox_rect(TET tet, int doc,
        String rect_path) throws TETException
    {
        double x1 = tet.pcos_get_number(doc, rect_path + "[0]");
        double y1 = tet.pcos_get_number(doc, rect_path + "[1]");
        double x2 = tet.pcos_get_number(doc, rect_path + "[2]");
        double y2 = tet.pcos_get_number(doc, rect_path + "[3]");

        double llx = Math.min(x1, x2);
        double lly = Math.min(y1, y2);
        double urx = Math.max(x1, x2);
        double ury = Math.max(y1, y2);

        /* Slightly reduce the annotation rectangle width */
        if (urx - llx > REFPOINT_NUDGE)
        {
            urx -= REFPOINT_NUDGE;
        }

        StringBuilder rect = new StringBuilder("{");
        rect.append(llx).append(' ');
        rect.append(lly).append(' ');
        rect.append(urx).append(' ');
        rect.append(ury).append(" }");
        return rect.toString();
    }

    /*
     * Retrieve the annotation title which contains the name of the user who
     * created the annotation. Acrobat always creates the title, but just in
     * case a fallback value is returned if there is no /T string.
     * You could also use this string to filter annotations by the author
     * who created them.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param annotation_path pCOS path of the annotation
     * @return The /T string, or "(unknown)" if it is missing
     * @throws TETException if a TET call fails
     */
    private static String get_annotation_title(TET tet, int doc,
        String annotation_path) throws TETException
    {
        String title_path = annotation_path + "/T";

        if (tet.pcos_get_string(doc, "type:" + title_path).equals("string"))
        {
            return tet.pcos_get_string(doc, title_path);
        }
        return "(unknown)";
    }

    /*
     * Retrieve and print the text of a page which was opened with an
     * includebox for one Highlight annotation.
     *
     * get_text() is called until it returns null, so that all chunks are
     * printed if pageoptlist selects a granularity which delivers more than
     * one chunk. The heading is printed only if there is at least one chunk.
     *
     * @param tet The TET object
     * @param page The page handle
     * @param pageno The 1-based page number (for messages)
     * @param highlight_count Running number of the annotation on the page
     * @param title The annotation title
     * @throws TETException if a TET call fails
     */
    private static void print_highlighted_text(TET tet, int page, int pageno,
        int highlight_count, String title) throws TETException
    {
        boolean heading_printed = false;
        String text;

        while ((text = tet.get_text(page)) != null)
        {
            if (!heading_printed)
            {
                out.print("+++++ Highlighted text " + highlight_count + " on page " + pageno +
                        " (highlighted by '" + title + "'):\n");
                heading_printed = true;
            }

            /* print the retrieved text */
            out.print(text);

            /* print a separator between chunks of text */
            out.print(SEPARATOR);
        }

        if (tet.get_errnum() != 0)
        {
            print_tet_error(tet, pageno);
        }
    }

    /*
     * Report a TET error.
     * 
     * @param tet The TET object
     * @param pageno The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno)
    {
        System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
    }
}
TET Cookbook

special/extract_highlighted_text