special/extract_highlighted_text
Extract text under Highlight annotations.
Download Java Code Show Output Show Input (TET-datasheet.pdf)
/*
* Extract highlighted text: Identify all "Highlight" annotations and extract
* exactly the text under the annotation (i.e. all highlighted text).
* The name of the user who created the annotation is also emitted.
*
* You can interactively copy the highlighted text in Acrobat DC by clicking
* on the Highlight annotation and selecting "Copy Text" in the context menu.
*
* Required software: TET 5
*
* Required data: PDF document with Highlight annotations
*
* Restrictions:
* - We use the annotation rectangle (/Rect) to identify the text area and
* ignore the /QuadPoints entry, which may collect multiple rectangles in a
* single annotation for multi-line text. Because of this simplification
* too much text may be extracted if the highlighted lines have different
* lengths.
* - Acrobat creates highlights which are slightly larger than the highlighted
* text. As a result too much text can be extracted if the adjacent glyph
* sits close to the annotation. We avoid this by slightly reducing the
* annotation width.
*/
package com.pdflib.cookbook.tet.special;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import com.pdflib.TET;
import com.pdflib.TETException;
/**
 * Command-line sample which prints the text located under each "Highlight"
 * annotation of a PDF document, together with the annotation's title
 * (the name of the user who created it).
 *
 * For every Highlight annotation the page is opened with an "includebox"
 * option built from the annotation's /Rect entry, so that only the text
 * inside that rectangle is returned.
 */
public class extract_highlighted_text
{
    /*
     * Global option list
     */
    static final String globaloptlist =
        "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /*
     * Document-specific option list
     */
    static final String docoptlist = "";

    /*
     * Page-specific option list
     */
    static final String pageoptlist = "granularity=page";

    /*
     * Separator to emit after each chunk of text. This depends on the
     * application's needs; for granularity=word a space character may be
     * useful.
     */
    static final String SEPARATOR = "\n";

    /*
     * Nudge value for the annotation width (in points). This avoids
     * problems where a glyph would incorrectly be considered as highlighted
     * because its reference point sits very close to the annotation border.
     * Acrobat usually expands the rectangle beyond the actual highlighted text.
     */
    private static final double REFPOINT_NUDGE = 2;

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * Program entry point.
     *
     * Expects exactly one argument, the name of the PDF file to process.
     * Extracted text goes to standard output (in OUTPUT_ENCODING), error
     * messages go to standard error.
     *
     * @param argv command-line arguments; argv[0] is the input file name
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not
     *         supported by the Java runtime
     */
    public static void main (String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        TET tet = null;

        try
        {
            if (argv.length != 1)
            {
                throw new Exception(
                    "usage: extract_highlighted_text <filename>");
            }

            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(argv[0], docoptlist);
            if (doc == -1)
            {
                throw new Exception("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /* Get number of pages in the document */
            int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                /* pCOS page indices are zero-based, TET page numbers one-based */
                String page_path = "pages[" + (pageno-1) + "]";
                int highlight_count = 0; /* number of highlight annotations */

                /*
                 * Use pCOS to identify Highlight annotations.
                 * For each Highlight annotation its rectangle coordinates are
                 * retrieved and used as includebox for extracting the text.
                 */
                int anncount = (int) tet.pcos_get_number(doc,
                    "length:" + page_path + "/annots");

                for (int i = 0; i < anncount; i++)
                {
                    String annotation_path = page_path + "/annots["+ i + "]";

                    if (!tet.pcos_get_string(doc, annotation_path + "/Subtype")
                            .equals("Highlight"))
                    {
                        continue; /* not a Highlight annotation */
                    }

                    String rect_path = annotation_path + "/Rect";
                    if (!tet.pcos_get_string(doc, "type:" + rect_path).equals("array") ||
                        (int) tet.pcos_get_number(doc, "length:" + rect_path) != 4)
                    {
                        out.print("Invalid or missing /Rect entry in Highlight annotation " + (highlight_count+1) + " (annotation ignored)\n");
                        continue; /* continue with next annotation */
                    }

                    highlight_count++;

                    String rect = build_rect_optlist(tet, doc, rect_path);
                    String title = get_annotation_title(tet, doc, annotation_path);

                    /*
                     * Use the "includebox" option with the Highlight rectangle
                     * coordinates to limit text extraction to this area on the page.
                     * We open the page repeatedly for each annotation so that we
                     * can distinguish the contents of each Highlight annotation.
                     */
                    int page = tet.open_page(doc, pageno,
                        pageoptlist + " includebox={" + rect + "}");
                    if (page == -1)
                    {
                        /*
                         * open_page() failed, so there is no valid page handle
                         * which could be closed; just report the problem.
                         */
                        print_tet_error(tet, pageno);
                        break; /* page is damaged; continue with next page */
                    }

                    /*
                     * Retrieve and print the highlighted text
                     */
                    String text = tet.get_text(page);
                    if (text != null)
                    {
                        /* print the retrieved text */
                        out.print("+++++ Highlighted text " + highlight_count + " on page " + pageno +
                            " (highlighted by '" + title + "'):\n");
                        out.print(text);

                        /* print a separator between chunks of text */
                        out.print(SEPARATOR);
                    }

                    if (tet.get_errnum() != 0)
                    {
                        print_tet_error(tet, pageno);
                    }

                    tet.close_page(page);
                }

                if (highlight_count == 0)
                {
                    out.print("+++++ No Highlight annotations on page " + pageno + "\n");
                }
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            System.err.println("TET exception occurred in extractor sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() +
                ": " + e.get_errmsg());
        }
        catch (Exception e)
        {
            System.err.println(e);
        }
        finally
        {
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /*
     * Build the rectangle part of the "includebox" option from the four
     * numbers of an annotation's /Rect array.
     *
     * The third value (urx, the right edge) is reduced by REFPOINT_NUDGE to
     * slightly shrink the annotation rectangle width.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param rect_path pCOS path of the /Rect array; the caller has already
     *        checked that it is an array with four entries
     * @return The rectangle as "{llx lly urx ury  }"
     */
    private static String build_rect_optlist(TET tet, int doc, String rect_path)
        throws TETException
    {
        StringBuilder rect = new StringBuilder("{");

        for (int j = 0; j < 4; j++)
        {
            double val = tet.pcos_get_number(doc, rect_path + "[" + j + "]");

            /* Slightly reduce the annotation rectangle width
             * (see comment for REFPOINT_NUDGE above)
             */
            if (j == 2) /* adjust urx value (=right edge) */
            {
                val -= REFPOINT_NUDGE;
            }

            rect.append(val).append(" ");
        }

        return rect.append(" }").toString();
    }

    /*
     * Retrieve the annotation title (/T entry) which contains the name of
     * the user who created the annotation. Acrobat always creates the title,
     * but just in case a fallback value is returned if /T is not a string.
     * This string could also be used to filter annotations by the author
     * who created them.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param annotation_path pCOS path of the annotation
     * @return The annotation title, or "(unknown)" if there is none
     */
    private static String get_annotation_title(TET tet, int doc,
        String annotation_path) throws TETException
    {
        String title_path = annotation_path + "/T";

        if (tet.pcos_get_string(doc, "type:" + title_path).equals("string"))
        {
            return tet.pcos_get_string(doc, title_path);
        }
        return "(unknown)";
    }

    /*
     * Report a TET error.
     *
     * @param tet The TET object
     * @param pageno The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno)
    {
        System.err.println("Error " + tet.get_errnum() + " in "
            + tet.get_apiname() + "() on page " + pageno + ": "
            + tet.get_errmsg());
    }
}