text/back_of_the_book_index
Create a sorted list of all words in the document along with the page numbers where the words occur.
Download Java Code Show Output Show Input (FontReporter.pdf)
/*
* Create a sorted list of all words in the document along with the page numbers
* where the words occur.
*
* Note that the index is limited to words starting with characters [A-Za-z], as
* the demonstration program lacks the features that would be necessary for
* making it a truly internationalized index program.
*
* Required software: TET 5
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.text;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.Collator;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import com.pdflib.TET;
import com.pdflib.TETException;
/*
 * Command-line demonstration that builds a back-of-the-book index for a PDF
 * document: every word starting with one of INCLUDE_CHARS is collected together
 * with the numbers of the pages it occurs on, and the result is printed sorted
 * and grouped by first letter.
 */
class back_of_the_book_index {
    /*
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /*
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list. The text is fetched word-wise.
     */
    private static final String PAGE_OPTLIST = "granularity=word";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Initialized in main() before any other method uses it.
     */
    private static PrintStream out;

    /*
     * A word must start with one of the characters in this string to be included in
     * the index (case doesn't matter).
     */
    private static final String INCLUDE_CHARS = "abcdefghijklmnopqrstuvwxyz";

    /*
     * Set this to true if all words are to be lowercased.
     */
    private static final boolean LOWERCASE_WORDS = false;

    /*
     * The name of the file to process.
     */
    private String filename;

    /*
     * A map of sets. The map key is the word, the value is a set of page numbers.
     * For the page set a LinkedHashSet is used, as we traverse the document in page
     * order, and the LinkedHashSet preserves the insertion order, which will give
     * us the desired sorted list of page numbers.
     */
    private Map<String, Set<Integer>> wordPages = new HashMap<String, Set<Integer>>();

    /*
     * Process a single page of text: every word that starts with one of the
     * INCLUDE_CHARS is recorded in wordPages together with the page number.
     *
     * If the page cannot be opened, the error is reported on System.err and the
     * page is skipped.
     *
     * @param tet TET object
     * @param doc TET document handle
     * @param pageno Page to process
     *
     * @throws TETException An error occurred in the TET API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /*
         * Fetch the text word-wise
         */
        for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
            /*
             * Defensive guard: charAt(0) below would throw on an empty string.
             */
            if (text.isEmpty()) {
                continue;
            }

            /*
             * Only include words that start with a letter out of the set of interesting
             * characters.
             */
            if (INCLUDE_CHARS.indexOf(Character.toLowerCase(text.charAt(0))) == -1) {
                continue;
            }

            final String word = LOWERCASE_WORDS ? text.toLowerCase() : text;

            Set<Integer> pages = wordPages.get(word);
            if (pages == null) {
                pages = new LinkedHashSet<Integer>();
                wordPages.put(word, pages);
            }
            pages.add(Integer.valueOf(pageno));
        }

        /*
         * get_text() returning null either means "no more text" or an error; the
         * error number tells the two cases apart.
         */
        if (tet.get_errnum() != 0) {
            System.err
                    .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        }
        tet.close_page(page);
    }

    /*
     * Print out the results: a heading, followed by all collected words in
     * collation order. Each word is followed by the comma-separated list of pages
     * it occurs on, and a line with the uppercased first letter is printed
     * whenever that letter changes.
     *
     * @param tet TET object (currently unused)
     * @param doc TET document handle (currently unused)
     *
     * @throws TETException declared for symmetry with the other processing
     *                      methods; the current body does not call the TET API
     */
    private void print_index(TET tet, int doc) throws TETException {
        out.println("Alphabetical list of words in the document \"" + filename + "\" along with their page number:");
        out.println();

        String[] words = wordPages.keySet().toArray(new String[wordPages.size()]);

        /*
         * Sort according to the sorting rules of the default locale. A Collator is
         * itself a Comparator, so it can be handed to Arrays.sort() directly.
         */
        Arrays.sort(words, Collator.getInstance());

        char currentGroup = 0;

        /*
         * Print out the words with the pages they appear on, grouped by first letter.
         */
        for (String word : words) {
            final char firstChar = Character.toUpperCase(word.charAt(0));
            if (firstChar != currentGroup) {
                out.println(firstChar);
                currentGroup = firstChar;
            }

            final StringBuilder line = new StringBuilder(word).append(' ');
            boolean first = true;
            for (Integer pageNumber : wordPages.get(word)) {
                if (!first) {
                    line.append(", ");
                }
                first = false;
                line.append(pageNumber);
            }
            out.println(line);
        }
    }

    /*
     * Generate the index for the given file.
     *
     * A failure to open the document is reported on System.err and the method
     * returns normally. A TETException is reported on System.err and terminates
     * the JVM with exit status 1, after the TET object has been released.
     */
    private void execute() {
        TET tet = null;
        int pageno = 0;
        boolean failed = false;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);
            final int doc = tet.open_document(filename, DOC_OPTLIST);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            } else {
                /*
                 * Loop over pages in the document
                 */
                final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
                for (pageno = 1; pageno <= n_pages; ++pageno) {
                    process_page(tet, doc, pageno);
                }

                /*
                 * Page processing is finished; reset the page number so that a later
                 * exception is not attributed to the non-existent page n_pages + 1.
                 */
                pageno = 0;

                print_index(tet, doc);
                tet.close_document(doc);
            }
        } catch (TETException e) {
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
            failed = true;
        } finally {
            /*
             * tet is still null if the TET constructor itself failed.
             */
            if (tet != null) {
                tet.delete();
            }
        }

        /*
         * Exit only after the cleanup above; System.exit() inside the catch block
         * would terminate the JVM before the finally block runs.
         */
        if (failed) {
            System.exit(1);
        }
    }

    /*
     * @param filename the name of the file for which the index will be generated
     */
    private back_of_the_book_index(String filename) {
        this.filename = filename;
    }

    /*
     * Program entry point. Expects exactly one argument, the name of the PDF file
     * to index; otherwise a usage message is printed.
     *
     * @param args command-line arguments
     *
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);
        if (args.length != 1) {
            out.println("usage: back_of_the_book_index <infilename>");
            return;
        }
        back_of_the_book_index c = new back_of_the_book_index(args[0]);
        c.execute();
    }
}