PDFlib Cookbook

cookbook

pdfua/tagged_scan_with_ocr_text

Download Java Code    Switch to PHP Code      Show Output PDF

/* $Id: tagged_scan_with_ocr_text.java,v 1.2 2014/01/30 14:05:28 rjs Exp $
 * Tagging for invisible OCR text which accompanies a scanned page 
 *  
 * Place an image and create invisible text on top of it with the
 * "textrendering" parameter set to 3.  The most common scenario for this is
 * "scanned page with invisible OCR text" (which has been retrieved from the
 * scanned page in an earlier step with OCR).
 * Suitable tags are created for the invisible text, while the scanned page
 * is tagged as Artifact.
 *
 * Required software: PDFlib/PDFlib+PDI/PPS 9
 * Required data: image file
 */
package com.pdflib.cookbook.pdflib.pdfua;

import com.pdflib.pdflib;
import com.pdflib.PDFlibException;

/**
 * Cookbook sample: creates a tagged PDF/UA-1 document consisting of one A4
 * page which shows a scanned image with invisible text placed on top of it.
 * The image is tagged as Artifact, and the invisible text is wrapped in a
 * single "P" structure element.
 */
public class tagged_scan_with_ocr_text {
    /**
     * Generates the output document and terminates the JVM with exit status
     * 0 on success or 1 if an exception was reported.
     *
     * @param argv command line arguments (not used)
     */
    public static void main(String[] argv) {
        /* This is where the data files are. Adjust as necessary. */
        String searchpath = "../input";
        String outfile = "tagged_scan_with_ocr_text.pdf";
        String title = "Tagged scan with OCR text";
        String imagefile = "multi_page.tif";

        pdflib p = null;
        int exitcode = 0;

        try {
            p = new pdflib();

            p.set_option("searchpath={" + searchpath + "}");

            /* This means we must check return values of load_font() etc. */
            p.set_option("errorpolicy=return");

            if (p.begin_document(outfile,
                "pdfua=PDF/UA-1 lang=en tag={tagname=Document}") == -1)
                throw new Exception("Error: " + p.get_errmsg());

            p.set_info("Creator", "PDFlib Cookbook");
            p.set_info("Title", title + " $Revision: 1.2 $");

            p.set_option("autospace=true");

            p.create_bookmark("Scanned page with OCR text", "");

            int font = p.load_font("DejaVuSerif", "unicode", "embedding");
            if (font == -1)
                throw new Exception("Error: " + p.get_errmsg());

            /* Load the first page of the scanned image file */
            int image = p.load_image("auto", imagefile, "page=1");
            if (image == -1)
                throw new Exception("Error: " + p.get_errmsg());

            /* Start page */
            p.begin_page_ext(0, 0, "width=a4.width height=a4.height");

            /* Place the scan and tag it as Artifact */
            p.fit_image(image, 0, 0,
                "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");
            p.close_image(image);

            /* This structure element contains all of the OCR text */
            int id = p.begin_item("P", "");

            /*
             * Save the current graphics state so that the text rendering
             * mode set below is undone again by the matching restore()
             */
            p.save();

            /* Set the text rendering mode to "invisible text" */
            p.set_text_option("textrendering=3");

            /*
             * Output the text invisibly on top of the image with the rendering
             * mode set to "invisible text" above. The following text
             * resembles text retrieved from the scanned page via OCR.
             */
            p.setfont(font, 19);
            p.fit_textline("PDFlib GmbH M\u00fcnchen, Germany", 130, 750, "");
            p.fit_textline("www.pdflib.com", 215, 710, "");

            p.setfont(font, 26);
            p.fit_textline("Tutorial for", 120, 477, "");
            p.fit_textline("PDFlib, PDI, and PPS", 120, 440, "");

            /* These two lines override the font size via the option list */
            p.fit_textline("A library for generating PDF on the fly",
                118, 312, "fontsize=20");
            p.fit_textline("Version 7.0.1", 253, 272, "fontsize=36");

            p.setfont(font, 19);
            p.fit_textline("General Edition for", 195, 120, "");
            p.fit_textline("Cobol, C, C++, Java, Perl", 165, 94, "");
            p.fit_textline("PHP, Python, RPG, Ruby, and Tcl", 140, 68, "");

            /* Restore the current graphics state */
            p.restore();

            p.end_item(id);

            p.end_page_ext("");

            p.end_document("");
        }
        catch (PDFlibException e) {
            System.err.print("PDFlib exception occurred:\n");
            System.err.print("[" + e.get_errnum() + "] " + e.get_apiname()
                + ": " + e.get_errmsg() + "\n");
            exitcode = 1;
        }
        catch (Exception e) {
            System.err.println(e.getMessage());
            exitcode = 1;
        }
        finally {
            if (p != null) {
                p.delete();
            }
        }

        /*
         * Exit only after the try statement has completed. Calling
         * System.exit() inside the finally clause would discard any
         * Throwable which is not caught above (e.g. an Error) and
         * silently terminate with exit status 0.
         */
        System.exit(exitcode);
    }
}