font/font_finder
Identify the locations in a PDF where a particular font is used. Print the page number, location, and start of text for each hit.
Download Java Code Show Output Show Input (TET-datasheet.pdf)
/*
* Identify the locations in a PDF where a particular font is used; print the
* page number, location, and start of text for each hit.
*
* usage: font_finder [ -ignorefonts <font list> | -includefonts <font
* list> ] <PDF document>
*
* A <font list> is a comma-separated list of font names. If neither
* -ignorefonts nor -includefonts is specified, all fonts are included. If
* -ignorefonts is specified, all fonts but the ignored ones are included. If
* -includefonts is specified, only the fonts in the specified font list are
* included.
*
* The application prints the coordinates in the same manner as Adobe Acrobat,
* with the origin of the coordinate system in the upper left corner. This is
* different from the PDF default coordinate system, which has the origin in the
* lower left corner. If you want to use the PDF default coordinates, set the
* variable USE_ACROBAT_COORDINATES to false. You can visualize page coordinates
* in Acrobat as follows:
* - To display cursor coordinates in Acrobat DC use
* View, Show/Hide, Cursor Coordinates.
* - The coordinates are displayed in the unit which is currently selected in
* Acrobat. To change the display units to points (as used in TET) in
* Acrobat DC proceed as follows: go to Edit, Preferences, Units &
* Guides, Units and select Points.
*
* Required software: TET 5.2
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.font;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import com.pdflib.TET;
import com.pdflib.TETException;
public class font_finder {
/**
 * Global option list. The program expects the "resource" directory parallel to
 * the "java" directory.
 */
private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";
/**
 * Document-specific option list.
 */
private static final String DOC_OPTLIST = "";
/**
 * Page-specific option list.
 */
private static final String PAGE_OPTLIST = "granularity=page";
/**
 * The encoding in which the output is sent to System.out; taken from the
 * "file.encoding" system property. For running the example in a Windows
 * command window, you can set this for example to "windows-1252" for getting
 * Latin-1 output.
 */
private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");
/**
 * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
 * Assigned once at the start of main().
 */
private static PrintStream out;
/**
 * Command line flag for fonts to ignore.
 */
private static final String IGNORE_OPT = "-ignorefonts";
/**
 * Command line flag for fonts to include.
 */
private static final String INCLUDE_OPT = "-includefonts";
/**
 * Maximum length of text to print out for a text chunk, if file names are
 * prepended.
 */
private static final int MAX_TEXT_LENGTH_MULTI_FILE = 25;
/**
 * Maximum length of text to print out for a text chunk, if file names are not
 * prepended.
 */
private static final int MAX_TEXT_LENGTH_SINGLE_FILE = 40;
/**
 * Use the Acrobat coordinate system with the origin in the upper left corner,
 * or the PDF default coordinate system with the origin in the lower left
 * corner.
 */
private static final boolean USE_ACROBAT_COORDINATES = true;
/**
 * Fonts to include in the output. If it is null, all fonts are included.
 */
private final Set<String> includedFonts;
/**
 * Fonts to exclude from the output. If it is null, no fonts are ignored.
 */
private final Set<String> ignoredFonts;
/**
 * Name of the input file.
 */
private final String filename;
/**
 * The format for printing the x and y coordinate values.
 */
private final NumberFormat coordFormat;
/**
 * Print the filename in each line. Intended for invocations with more than one
 * input file.
 */
private final boolean prependFilenames;
/**
 * Unicode code point for ARABIC TATWEEL character.
 */
private static final int U_ARABIC_TATWEEL = 0x640;
/**
* @param filename The name of the input document.
* @param fontsToInclude Set of fonts to include in the output (may be null).
* @param fontsToIgnore Set of fonts to exclude from the output (may be
* null).
* @param prependFilenames Prepend the filename in each line.
*/
private font_finder(String filename, Set<String> fontsToInclude, Set<String> fontsToIgnore,
boolean prependFilenames) {
this.filename = filename;
this.includedFonts = fontsToInclude;
this.ignoredFonts = fontsToIgnore;
this.prependFilenames = prependFilenames;
this.coordFormat = NumberFormat.getInstance();
coordFormat.setMinimumFractionDigits(0);
coordFormat.setMaximumFractionDigits(2);
}
/**
 * Run the actual font finder algorithm: open the input document, process
 * every page in turn and close the document again.
 *
 * A failure to open the document is reported on System.err and the method
 * returns normally. A TETException is reported on System.err (with the page
 * number if it occurred after the page loop started) and terminates the
 * program with exit status 1.
 */
private void execute() {
    TET tet = null;
    /* Stays 0 until the page loop starts; used to tailor the error message. */
    int pageno = 0;
    try {
        tet = new TET();
        tet.set_option(GLOBAL_OPTLIST);
        final int doc = tet.open_document(filename, DOC_OPTLIST);
        if (doc == -1) {
            System.err
                    .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        } else {
            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, pageno);
            }
            tet.close_document(doc);
        }
    } catch (TETException e) {
        if (pageno == 0) {
            System.err
                    .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
        } else {
            System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                    + e.get_errmsg() + "\n");
        }
        System.exit(1);
    } finally {
        /*
         * tet is still null if constructing the TET object failed. Calling
         * delete() unconditionally would then raise a NullPointerException
         * that hides the original failure.
         */
        if (tet != null) {
            tet.delete();
        }
    }
}
/**
 * Open one page, pull its text from TET piece by piece and hand each piece to
 * process_char_info() so that the contiguous same-font chunks are reported.
 * Failures to open the page or to retrieve text are reported on System.err.
 *
 * @param tet    TET object
 * @param doc    TET document handle
 * @param pageno Page to process
 *
 * @throws TETException An error occurred in the TET API
 */
private void process_page(TET tet, final int doc, int pageno) throws TETException {
    final int pageHandle = tet.open_page(doc, pageno, PAGE_OPTLIST);

    /* Nothing to do if the page could not be opened. */
    if (pageHandle == -1) {
        System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        return;
    }

    /* get_text() is called until it returns null. */
    String pageText;
    while ((pageText = tet.get_text(pageHandle)) != null) {
        process_char_info(tet, doc, pageno, pageHandle, pageText);
    }

    /* A non-zero error number after the loop is reported as an error. */
    if (tet.get_errnum() != 0) {
        System.err
                .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
    }

    tet.close_page(pageHandle);
}
/**
 * Walk over the character information of the given page, group consecutive
 * characters that share a font into chunks, and print one line per chunk.
 *
 * @param tet    TET object
 * @param doc    TET document handle.
 * @param pageno Page number
 * @param page   TET page handle
 * @param text   The text of the page (not evaluated here; the characters are
 *               read via get_char_info())
 *
 * @throws TETException An error occurred in the TET API
 */
private void process_char_info(TET tet, int doc, int pageno, int page, String text) throws TETException {
    /*
     * The page height is needed for transforming the coordinates to Acrobat's
     * coordinate system.
     */
    final double pageHeight = tet.pcos_get_number(doc, "pages[" + (pageno - 1) + "]/height");

    /* State of the chunk being collected; -1 means no chunk has started yet. */
    int runFontId = -1;
    double runX = 0;
    double runY = 0;
    final StringBuilder run = new StringBuilder();

    for (int ci = tet.get_char_info(page); ci != -1; ci = tet.get_char_info(page)) {
        /*
         * Under certain conditions get_char_info() returns information about
         * a character that can be ignored and must not be counted:
         *
         * - Unicode character ARABIC TATWEEL
         * - control characters
         * - unmappable glyphs
         * - hyphens removed by dehyphenation
         */
        final boolean ignorable = tet.uv == U_ARABIC_TATWEEL || Character.isISOControl(tet.uv) || tet.unknown
                || (tet.attributes & TET.ATTR_DEHYPHENATION_ARTIFACT) != 0;
        if (ignorable) {
            continue;
        }

        if (tet.fontid != runFontId) {
            /* Font changed: flush the finished chunk (if any), start a new one. */
            if (runFontId != -1) {
                print_chunk_info(tet, doc, pageno, run.toString(), runFontId, runX, runY, pageHeight);
            }
            runFontId = tet.fontid;
            runX = tet.x;
            runY = tet.y;
            run.setLength(0);
        }

        /* Add the Unicode code point to the current chunk. */
        run.appendCodePoint(tet.uv);
    }

    /* Flush the last chunk of the page. */
    if (runFontId != -1) {
        print_chunk_info(tet, doc, pageno, run.toString(), runFontId, runX, runY, pageHeight);
    }
}
/**
 * Print information about a chunk of text that has the same font: optional
 * file name, page number, position, font name and the (possibly truncated)
 * start of the text. Nothing is printed if the font is filtered out by the
 * include/ignore lists.
 *
 * @param tet           TET object
 * @param doc           TET document handle.
 * @param pageno        Page number
 * @param chunk         The current text chunk that has the same font assigned
 * @param currentFontId pCOS id of the current font
 * @param xPos          x position of chunk
 * @param yPos          y position of chunk in PDF default coordinates
 * @param pageHeight    height of page
 *
 * @throws TETException An error occurred in the TET API
 */
private void print_chunk_info(TET tet, int doc, int pageno, String chunk, int currentFontId, double xPos,
        double yPos, double pageHeight) throws TETException {
    final String fontName = tet.pcos_get_string(doc, "fonts[" + currentFontId + "]/name");
    if (!includeFontInOutput(fontName)) {
        return;
    }

    /* Flip the y axis if Acrobat-style coordinates are requested. */
    final double displayY = USE_ACROBAT_COORDINATES ? pageHeight - yPos : yPos;

    /*
     * Only print filename if there is more than one file name given on the
     * command line.
     */
    if (prependFilenames) {
        out.print(filename + ", ");
    }
    out.print("page " + pageno);
    out.print(" at (" + coordFormat.format(xPos) + " " + coordFormat.format(displayY) + "), ");
    out.print("font " + fontName + ": ");

    int displayLength = Math.min(prependFilenames ? MAX_TEXT_LENGTH_MULTI_FILE : MAX_TEXT_LENGTH_SINGLE_FILE,
            chunk.length());

    /*
     * Avoid splitting a surrogate pair: if the code point starting at the last
     * displayed code unit is beyond the Basic Multilingual Plane (BMP), add
     * another code unit. The displayLength > 0 guard keeps an empty chunk
     * from triggering codePointAt(-1).
     */
    if (displayLength > 0 && Character.isSupplementaryCodePoint(chunk.codePointAt(displayLength - 1))) {
        displayLength += 1;
    }

    out.print(chunk.substring(0, displayLength));
    if (chunk.length() > displayLength) {
        out.print("...");
    }
    out.println();
}
/**
 * Decide whether text in the given font is reported.
 *
 * A font is reported if no filter is set at all, if it is listed in the
 * include set, or if an ignore set exists and does not list it.
 *
 * @param fontName The name of the font to check
 *
 * @return true if the font has to be included in the output, otherwise false
 */
private boolean includeFontInOutput(String fontName) {
    /* Explicitly included fonts always pass. */
    if (includedFonts != null && includedFonts.contains(fontName)) {
        return true;
    }
    /* With an ignore list, everything not on it passes. */
    if (ignoredFonts != null) {
        return !ignoredFonts.contains(fontName);
    }
    /* No ignore list: pass only if there is no include list either. */
    return includedFonts == null;
}
/**
 * Prints out a font set as a comma-separated list ("A, B, C") on the output
 * stream, without a trailing line break.
 *
 * @param fonts A set of fonts to print as a list.
 */
private static void print_font_list(Set<String> fonts) {
    /* Tracks whether a ", " separator is needed before the next name. */
    boolean first = true;
    for (String fontName : fonts) {
        if (!first) {
            out.print(", ");
        }
        out.print(fontName);
        first = false;
    }
}
/**
 * Turn a comma-separated list of font names into a sorted Set. Empty entries
 * (as produced by leading, trailing or doubled commas) are dropped.
 *
 * @param fontList A comma-separated list of font names.
 *
 * @return A Set containing the elements of the font list
 */
private static Set<String> parse_font_list(String fontList) {
    final Set<String> names = new TreeSet<String>();
    for (String candidate : fontList.split(",")) {
        /* Skip empty pieces so that only real names end up in the set. */
        if (candidate.length() > 0) {
            names.add(candidate);
        }
    }
    return names;
}
/**
 * Main program: parse the optional font filter, print a header describing
 * the filter and run the font finder on every input file.
 *
 * @param args command line arguments
 *
 * @throws UnsupportedEncodingException Unsupported encoding specified for
 *                                      System.out
 */
public static void main(String[] args) throws UnsupportedEncodingException {
    System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
    out = new PrintStream(System.out, true, OUTPUT_ENCODING);

    Set<String> includeSet = null;
    Set<String> ignoreSet = null;

    /* Consume leading option/value pairs; stop at the first non-option. */
    int next = 0;
    while (next < args.length) {
        final boolean isIgnore = args[next].equals(IGNORE_OPT);
        final boolean isInclude = !isIgnore && args[next].equals(INCLUDE_OPT);
        if (!isIgnore && !isInclude) {
            break;
        }

        /* Move on to the option's value. */
        next += 1;

        /* A value must follow, and only one filter option may be given. */
        final boolean valueUsable = next < args.length && ignoreSet == null && includeSet == null;
        if (!valueUsable) {
            usage();
        } else if (isIgnore) {
            ignoreSet = parse_font_list(args[next]);
        } else {
            includeSet = parse_font_list(args[next]);
        }

        next += 1;
    }

    /* At least one item must be left as the input file. */
    if (next >= args.length) {
        usage();
        return;
    }

    /*
     * Header describing the included and excluded fonts.
     */
    out.print("included fonts: ");
    if (includeSet != null) {
        print_font_list(includeSet);
    } else {
        out.print("all except ignored fonts");
    }
    out.println();

    out.print("ignored fonts: ");
    if (ignoreSet != null) {
        print_font_list(ignoreSet);
    } else {
        out.print("none");
    }
    out.println();

    /*
     * Only prepend input filenames to each line if there is more than one
     * input file.
     */
    final boolean severalInputs = (args.length - next) > 1;
    for (int fileIndex = next; fileIndex < args.length; fileIndex++) {
        final font_finder finder = new font_finder(args[fileIndex], includeSet, ignoreSet, severalInputs);
        finder.execute();
    }
}
/**
 * Print the command line synopsis to System.err and terminate the program
 * with exit status 1.
 */
private static void usage() {
    final String synopsis = "usage: font_finder [ -ignorefonts <font list> |  -includefonts <font list> ] <PDF document> ...";
    System.err.println(synopsis);
    System.exit(1);
}
}