PDFlib Cookbook

cookbook

pdfua/scan_with_ocr_pdfua1

Tagging for invisible OCR text which accompanies a scanned page.

Download PHP Code  Switch to Java Code  Show Output 

<?php
/*
 * Tagging for invisible OCR text which accompanies a scanned page 
 *  
 * Place an image and create invisible text on top of it with the
 * "textrendering" parameter set to 3.  The most common scenario for this is
 * "scanned page with invisible OCR text" (which has been retrieved from the
 * scanned page in an earlier step with OCR).
 * Suitable tags are created for the invisible text, while the scanned page
 * is tagged as Artifact.
 *
 * Required software: PDFlib/PDFlib+PDI/PPS 10
 * Required data: image file
 */

/*
 * Configuration. The search path points at the "input" directory located
 * three levels above this file; adjust it if the data files live elsewhere.
 */
$searchpath = dirname(__DIR__, 2) . "/input";

/* An empty output file name makes the script fetch the PDF via get_buffer(). */
$outfile = "";
$title = "Tagged scan with OCR text";
$imagefile = "multi_page.tif";

$p = null;

/*
 * Text which resembles the OCR result for the scanned page. Each entry
 * becomes one "P" structure element. "fontsize" is the size handed to
 * setfont() before the lines are placed; null means that no setfont() call
 * is made for that paragraph (its lines carry their own "fontsize" option).
 * Every line is given as: text, x, y, option list for fit_textline().
 */
$paragraphs = [
    [
        "fontsize" => 19,
        "lines" => [
            ["PDFlib GmbH M\u{00fc}nchen, Germany", 130, 750, ""],
            ["www.pdflib.com", 215, 710, ""],
        ],
    ],
    [
        "fontsize" => 26,
        "lines" => [
            ["Tutorial for", 120, 477, ""],
            ["PDFlib, PDI, and PPS", 120, 440, ""],
        ],
    ],
    [
        "fontsize" => null,
        "lines" => [
            ["A library for generating PDF on the fly", 118, 312, "fontsize=20"],
            ["Version 7.0.1", 253, 272, "fontsize=36"],
        ],
    ],
    [
        "fontsize" => 19,
        "lines" => [
            ["General Edition for", 195, 120, ""],
            ["Cobol, C, C++, Java, Perl", 165, 94, ""],
            ["PHP, Phyton, RPG, Ruby, and Tcl", 140, 68, ""],
        ],
    ],
];

try {
    $p = new pdflib();

    $p->set_option(sprintf("searchpath={%s}", $searchpath));

    /*
     * With errorpolicy=return the calls below report failure through their
     * return value, so each result is compared against 0 explicitly.
     */
    $p->set_option("errorpolicy=return");

    $documentOptions = "pdfua=PDF/UA-1 lang=en tag={tagname=Document}";
    if ($p->begin_document($outfile, $documentOptions) == 0) {
        throw new Exception("Error: " . $p->get_errmsg());
    }

    $p->set_info("Creator", "PDFlib Cookbook");
    $p->set_info("Title", $title);

    $p->set_option("autospace=true");

    $p->create_bookmark("Scanned page with OCR text", "");

    /* Font used for the invisible text layer */
    $serifFont = $p->load_font("NotoSerif-Regular", "unicode", "");
    if ($serifFont == 0) {
        throw new Exception("Error: " . $p->get_errmsg());
    }

    /* First page of the scanned image file */
    $scan = $p->load_image("auto", $imagefile, "page=1");
    if ($scan == 0) {
        throw new Exception("Error: " . $p->get_errmsg());
    }

    /* A4 page which holds the scan and the text layer */
    $p->begin_page_ext(0, 0, "width=a4.width height=a4.height");

    /* The scan itself is tagged as Artifact; the image handle is released
     * right after placement. */
    $p->fit_image($scan, 0, 0,
        "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");
    $p->close_image($scan);

    /* Text rendering mode 3 means "invisible text" */
    $p->set_text_option("textrendering=3");

    /*
     * Place the OCR-like text on top of the scan, one "P" element per
     * table entry, using the invisible rendering mode selected above.
     */
    foreach ($paragraphs as $paragraph) {
        $itemId = $p->begin_item("P", "");

        if ($paragraph["fontsize"] !== null) {
            $p->setfont($serifFont, $paragraph["fontsize"]);
        }

        foreach ($paragraph["lines"] as list($text, $x, $y, $optlist)) {
            $p->fit_textline($text, $x, $y, $optlist);
        }

        $p->end_item($itemId);
    }

    $p->end_page_ext("");
    $p->end_document("");

    /* Deliver the generated PDF to the client */
    $pdfBytes = $p->get_buffer();

    header("Content-type: application/pdf");
    header("Content-Length: " . strlen($pdfBytes));
    header("Content-Disposition: inline; filename=scan_with_ocr_pdfua1.pdf");
    print $pdfBytes;
}
catch (PDFlibException $e) {
    /* Report error number, API name and message of the PDFlib failure */
    echo sprintf(
        "PDFlib exception occurred in scan_with_ocr_pdfua1 sample:\n[%s] %s: %s\n",
        $e->get_errnum(),
        $e->get_apiname(),
        $e->get_errmsg()
    );
    exit(1);
}
catch (Throwable $e) {
    /* Anything else, including the Exception thrown for failed calls above */
    echo $e;
    exit(1);
}

/* Drop the reference to the PDFlib object */
$p = 0;
?>