pdfua/scan_with_ocr_pdfua1
Tagging for invisible OCR text which accompanies a scanned page.
Download Java Code Switch to PHP Code Show Output
/*
* Tagging for invisible OCR text which accompanies a scanned page
*
* Place an image and create invisible text on top of it with the
* "textrendering" parameter set to 3. The most common scenario for this is
* "scanned page with invisible OCR text" (which has been retrieved from the
* scanned page in an earlier step with OCR).
* Suitable tags are created for the invisible text, while the scanned page
* is tagged as Artifact.
*
* Required software: PDFlib/PDFlib+PDI/PPS 10
* Required data: image file
*/
package com.pdflib.cookbook.pdflib.pdfua;
import com.pdflib.pdflib;
import com.pdflib.PDFlibException;
/**
 * Cookbook sample: creates a PDF/UA-1 document containing a scanned page
 * together with invisible text placed on top of the scan.
 *
 * <p>The image is placed with an Artifact tag, while the invisible text
 * (text rendering mode 3) is wrapped in "P" structure items.
 */
public class scan_with_ocr_pdfua1 {
    /**
     * Generates "scan_with_ocr_pdfua1.pdf" and terminates the JVM with exit
     * code 0 on success or 1 if any exception was caught.
     *
     * @param argv command-line arguments (not used)
     */
    public static void main(String[] argv) {
        /* This is where the data files are. Adjust as necessary. */
        String searchpath = "../input";
        String outfile = "scan_with_ocr_pdfua1.pdf";
        String title = "Tagged scan with OCR text";

        pdflib p = null;
        String imagefile = "multi_page.tif";
        int font, image, id;
        int exitcode = 0;

        try {
            p = new pdflib();

            p.set_option("searchpath={" + searchpath + "}");

            /* This means we must check return values of load_font() etc. */
            p.set_option("errorpolicy=return");

            /* Open the output document in PDF/UA-1 mode with a Document root tag */
            if (p.begin_document(outfile,
                    "pdfua=PDF/UA-1 lang=en tag={tagname=Document}") == -1)
                throw new Exception("Error: " + p.get_errmsg());

            p.set_info("Creator", "PDFlib Cookbook");
            p.set_info("Title", title);

            p.set_option("autospace=true");

            p.create_bookmark("Scanned page with OCR text", "");

            font = p.load_font("NotoSerif-Regular", "unicode", "");
            if (font == -1)
                throw new Exception("Error: " + p.get_errmsg());

            /* Load the first page of the image file */
            image = p.load_image("auto", imagefile, "page=1");
            if (image == -1)
                throw new Exception("Error: " + p.get_errmsg());

            /* Start page */
            p.begin_page_ext(0, 0, "width=a4.width height=a4.height");

            /* Place the scan and tag it as Artifact */
            p.fit_image(image, 0, 0,
                "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");
            p.close_image(image);

            /* Set the text rendering mode to "invisible text" */
            p.set_text_option("textrendering=3");

            /*
             * Output the text invisibly on top of the image with the rendering
             * mode set to "invisible text" above. The following text
             * resembles text retrieved from the scanned page via OCR.
             * Each group of text lines is wrapped in its own "P" item.
             */

            /* First paragraph: two lines at font size 19 */
            id = p.begin_item("P", "");
            p.setfont(font, 19);
            p.fit_textline("PDFlib GmbH M\u00fcnchen, Germany", 130, 750, "");
            p.fit_textline("www.pdflib.com", 215, 710, "");
            p.end_item(id);

            /* Second paragraph: two lines at font size 26 */
            id = p.begin_item("P", "");
            p.setfont(font, 26);
            p.fit_textline("Tutorial for", 120, 477, "");
            p.fit_textline("PDFlib, PDI, and PPS", 120, 440, "");
            p.end_item(id);

            /*
             * Third paragraph: no setfont() call here; the size is supplied
             * per line through the "fontsize" option instead.
             */
            id = p.begin_item("P", "");
            p.fit_textline("A library for generating PDF on the fly",
                118, 312, "fontsize=20");
            p.fit_textline("Version 7.0.1", 253, 272, "fontsize=36");
            p.end_item(id);

            /*
             * Fourth paragraph: three lines at font size 19. The invisible
             * text is the only text content on the page, so it must be
             * spelled correctly ("Python").
             */
            id = p.begin_item("P", "");
            p.setfont(font, 19);
            p.fit_textline("General Edition for", 195, 120, "");
            p.fit_textline("Cobol, C, C++, Java, Perl", 165, 94, "");
            p.fit_textline("PHP, Python, RPG, Ruby, and Tcl", 140, 68, "");
            p.end_item(id);

            p.end_page_ext("");

            p.end_document("");
        }
        catch (PDFlibException e) {
            /* Report error number, API function name and message */
            System.err.println("PDFlib exception occurred:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() +
                ": " + e.get_errmsg());
            exitcode = 1;
        }
        catch (Exception e) {
            /* Also catches the exceptions thrown above for -1 return values */
            System.err.println(e);
            exitcode = 1;
        }
        finally {
            /* Release the PDFlib object if it was created, then exit */
            if (p != null) {
                p.delete();
            }
            System.exit(exitcode);
        }
    }
}