BASKET
Search
PDFlib

pdfua/tagged_scan_with_ocr_text

Download Java Code    Switch to PHP Code      Show Output PDF

/* $Id: tagged_scan_with_ocr_text.java,v 1.2 2014/01/30 14:05:28 rjs Exp $

 * Tagging for invisible OCR text which accompanies a scanned page

 * 

 * Place an image and create invisible text on top of it with the

 * "textrendering" parameter set to 3.  The most common scenario for this is

 * "scanned page with invisible OCR text" (which has been retrieved from the

 * scanned page in an earlier step with OCR).

 * Suitable tags are created for the invisible text, while the scanned page

 * is tagged as Artifact.

 *

 * Required software: PDFlib/PDFlib+PDI/PPS 9

 * Required data: image file

 */

package com.pdflib.cookbook.pdflib.pdfua;


import com.pdflib.pdflib;

import com.pdflib.PDFlibException;


public class tagged_scan_with_ocr_text {

    public static void main(String argv[]) {

        /* This is where the data files are. Adjust as necessary. */

        String searchpath = "../input";

        String outfile = "tagged_scan_with_ocr_text.pdf";

        String title = "Tagged scan with OCR text";


        pdflib p = null;


        String imagefile = "multi_page.tif";

        int font, image, id;

        int exitcode = 0;


        try {

            p = new pdflib();


            p.set_option("searchpath={" + searchpath + "}");


            /* This means we must check return values of load_font() etc. */

            p.set_option("errorpolicy=return");


            if (p.begin_document(outfile,

                "pdfua=PDF/UA-1 lang=en tag={tagname=Document}") == -1)

                throw new Exception("Error: " + p.get_errmsg());


            p.set_info("Creator", "PDFlib Cookbook");

            p.set_info("Title", title + " $Revision: 1.2 $");

           

            p.set_option("autospace=true");

           

            p.create_bookmark("Scanned page with OCR text", "");


            font = p.load_font("DejaVuSerif", "unicode", "embedding");

            if (font == -1)

                throw new Exception("Error: " + p.get_errmsg());


            /* Load the image */

            image = p.load_image("auto", imagefile, "page=1");

            if (image == -1)

                throw new Exception("Error: " + p.get_errmsg());


            /* Start page */

            p.begin_page_ext(0, 0, "width=a4.width height=a4.height");


            /* Place the scan and tag it as Artifact */

            p.fit_image(image, 0, 0,

                "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");

            p.close_image(image);


            /* This structure element contains all of the OCR text */


            id = p.begin_item("P", "");

            /* Save the current graphics state */

            p.save();


            /* Set the text rendering mode to "invisible text" */

            p.set_text_option("textrendering=3");


            /*

             * Output the text invisibly on top of the image with the rendering

             * mode set to "invisible text" above. The following text

             * resembles text retrieved from the scanned page via OCR.

             */

            p.setfont(font, 19);

            p.fit_textline("PDFlib GmbH M\u00fcnchen, Germany", 130, 750, "");

            p.fit_textline("www.pdflib.com", 215, 710, "");


            p.setfont(font, 26);

            p.fit_textline("Tutorial for", 120, 477, "");

            p.fit_textline("PDFlib, PDI, and PPS", 120, 440, "");

           

            p.fit_textline("A library for generating PDF on the fly",

                118, 312, "fontsize=20");

            p.fit_textline("Version 7.0.1", 253, 272, "fontsize=36");


            p.setfont(font, 19);

            p.fit_textline("General Edition for", 195, 120, "");

            p.fit_textline("Cobol, C, C++, Java, Perl", 165, 94, "");

            p.fit_textline("PHP, Phyton, RPG, Ruby, and Tcl", 140, 68, "");


            /* Restore the current graphics state */

            p.restore();

           

            p.end_item(id);


            p.end_page_ext("");


            p.end_document("");

        }

        catch (PDFlibException e) {

            System.err.print("PDFlib exception occurred:\n");

            System.err.print("[" + e.get_errnum() + "] " + e.get_apiname()

                + ": " + e.get_errmsg() + "\n");

            exitcode = 1;

        }

        catch (Exception e) {

            System.err.println(e.getMessage());

            exitcode = 1;

        }

        finally {

            if (p != null) {

                p.delete();

            }

            System.exit(exitcode);

        }

    }

}