PDFlib Cookbook

cookbook

pdfua/scan_with_ocr_pdfua1

Tagging for invisible OCR text which accompanies a scanned page.

Download Java Code  Switch to PHP Code  Show Output 

/*
 * Tagging for invisible OCR text which accompanies a scanned page 
 *  
 * Place an image and create invisible text on top of it by setting the
 * "textrendering" text option to 3. The most common scenario for this is a
 * "scanned page with invisible OCR text", where the text was retrieved from
 * the scanned page by OCR in an earlier step.
 * Suitable tags are created for the invisible text, while the scanned page
 * is tagged as Artifact.
 *
 * Required software: PDFlib/PDFlib+PDI/PPS 10
 * Required data: image file
 */
package com.pdflib.cookbook.pdflib.pdfua;

import com.pdflib.pdflib;
import com.pdflib.PDFlibException;

/**
 * Cookbook sample: creates a one-page PDF/UA-1 document in which a scanned
 * page image is placed as an Artifact and invisible text (text rendering
 * mode 3) is laid over it inside "P" structure elements.
 */
public class scan_with_ocr_pdfua1 {
    /**
     * Builds the output document and then terminates the JVM.
     *
     * The process exits with status 0 on success and with status 1 if any
     * exception was caught while generating the document.
     *
     * @param argv command-line arguments; not used
     */
    public static void main(String[] argv) {
        /* This is where the data files are. Adjust as necessary. */
        String searchpath = "../input";
        String outfile = "scan_with_ocr_pdfua1.pdf";
        String title = "Tagged scan with OCR text";

        pdflib p = null;

        String imagefile = "multi_page.tif";
        int font, image, id;
        int exitcode = 0;

        try {
            p = new pdflib();

            p.set_option("searchpath={" + searchpath + "}");

            /* This means we must check return values of load_font() etc. */
            p.set_option("errorpolicy=return");

            /* Open a PDF/UA-1 document with "Document" as root element */
            if (p.begin_document(outfile,
                "pdfua=PDF/UA-1 lang=en tag={tagname=Document}") == -1) {
                throw new Exception("Error: " + p.get_errmsg());
            }

            p.set_info("Creator", "PDFlib Cookbook");
            p.set_info("Title", title);

            p.set_option("autospace=true");

            p.create_bookmark("Scanned page with OCR text", "");

            font = p.load_font("NotoSerif-Regular", "unicode", "");
            if (font == -1) {
                throw new Exception("Error: " + p.get_errmsg());
            }

            /* Load the first page of the (multi-page) image file */
            image = p.load_image("auto", imagefile, "page=1");
            if (image == -1) {
                throw new Exception("Error: " + p.get_errmsg());
            }

            /* Start page */
            p.begin_page_ext(0, 0, "width=a4.width height=a4.height");

            /*
             * Place the scan and tag it as Artifact. The 595 x 842 box is
             * presumably meant to match the A4 page requested above.
             */
            p.fit_image(image, 0, 0,
                "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");
            p.close_image(image);

            /* Set the text rendering mode to "invisible text" */
            p.set_text_option("textrendering=3");

            /*
             * Output the text invisibly on top of the image with the rendering
             * mode set to "invisible text" above. The following text
             * resembles text retrieved from the scanned page via OCR.
             * Each group of lines is wrapped in its own "P" element.
             */
            id = p.begin_item("P", "");
                p.setfont(font, 19);
                p.fit_textline("PDFlib GmbH M\u00fcnchen, Germany", 130, 750, "");
                p.fit_textline("www.pdflib.com", 215, 710, "");
            p.end_item(id);

            id = p.begin_item("P", "");
                p.setfont(font, 26);
                p.fit_textline("Tutorial for", 120, 477, "");
                p.fit_textline("PDFlib, PDI, and PPS", 120, 440, "");
            p.end_item(id);

            /*
             * No setfont() call here: the font size is supplied per line
             * through the "fontsize" option instead.
             */
            id = p.begin_item("P", "");
                p.fit_textline("A library for generating PDF on the fly",
                    118, 312, "fontsize=20");
                p.fit_textline("Version 7.0.1", 253, 272, "fontsize=36");
            p.end_item(id);

            /*
             * The scan itself is an Artifact, so this text layer is the only
             * tagged content on the page; its spelling therefore matters.
             */
            id = p.begin_item("P", "");
                p.setfont(font, 19);
                p.fit_textline("General Edition for", 195, 120, "");
                p.fit_textline("Cobol, C, C++, Java, Perl", 165, 94, "");
                p.fit_textline("PHP, Python, RPG, Ruby, and Tcl", 140, 68, "");
            p.end_item(id);


            p.end_page_ext("");

            p.end_document("");
        }
        catch (PDFlibException e) {
            System.err.println("PDFlib exception occurred:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() +
                ": " + e.get_errmsg());
            exitcode = 1;
        }
        catch (Exception e) {
            /* Raised above when a checked PDFlib call returned -1 */
            System.err.println(e);
            exitcode = 1;
        }
        finally {
            /* Release the PDFlib object before leaving the JVM */
            if (p != null) {
                p.delete();
            }
            System.exit(exitcode);
        }
    }
}