pdfua/scan_with_ocr_pdfua1
Tagging for invisible OCR text which accompanies a scanned page.
Download PHP Code Switch to Java Code Show Output
<?php
/*
* Tagging for invisible OCR text which accompanies a scanned page
*
* Place an image and create invisible text on top of it with the
* "textrendering" parameter set to 3. The most common scenario for this is
* "scanned page with invisible OCR text" (which has been retrieved from the
* scanned page in an earlier step with OCR).
* Suitable tags are created for the invisible text, while the scanned page
* is tagged as Artifact.
*
* Required software: PDFlib/PDFlib+PDI/PPS 10
* Required data: image file
*/
/* This is where the data files are. Adjust as necessary. */
$searchpath = dirname(__FILE__,3)."/input";

/*
 * An empty output file name: the finished document is fetched with
 * get_buffer() at the end and sent to the client.
 */
$outfile = "";
$title = "Tagged scan with OCR text";

$p = null;

/* Scanned input; only its first page is placed. */
$imagefile = "multi_page.tif";

try {
    $p = new pdflib();

    $p->set_option("searchpath={" . $searchpath . "}");

    /* This means we must check return values of load_font() etc. */
    $p->set_option("errorpolicy=return");

    /* Start a PDF/UA-1 document with a "Document" root structure element */
    if ($p->begin_document($outfile,
            "pdfua=PDF/UA-1 lang=en tag={tagname=Document}") == 0)
        throw new Exception("Error: " . $p->get_errmsg());

    $p->set_info("Creator", "PDFlib Cookbook");
    $p->set_info("Title", $title);

    $p->set_option("autospace=true");

    $p->create_bookmark("Scanned page with OCR text", "");

    $font = $p->load_font("NotoSerif-Regular", "unicode", "");
    if ($font == 0)
        throw new Exception("Error: " . $p->get_errmsg());

    /* Load the image */
    $image = $p->load_image("auto", $imagefile, "page=1");
    if ($image == 0)
        throw new Exception("Error: " . $p->get_errmsg());

    /* Start page */
    $p->begin_page_ext(0, 0, "width=a4.width height=a4.height");

    /* Place the scan and tag it as Artifact */
    $p->fit_image($image, 0, 0,
        "boxsize={595 842} fitmethod=meet tag={tagname=Artifact}");

    /* The image handle is not used again after placement */
    $p->close_image($image);

    /* Set the text rendering mode to "invisible text" */
    $p->set_text_option("textrendering=3");

    /*
     * Output the text invisibly on top of the image with the rendering
     * mode set to "invisible text" above. The following text
     * resembles text retrieved from the scanned page via OCR.
     * Each group of lines is wrapped in its own "P" structure element.
     */

    /* Paragraph 1: company name and URL */
    $id = $p->begin_item("P", "");
    $p->setfont($font, 19);
    $p->fit_textline("PDFlib GmbH M\u{00fc}nchen, Germany", 130, 750, "");
    $p->fit_textline("www.pdflib.com", 215, 710, "");
    $p->end_item($id);

    /* Paragraph 2: document title */
    $id = $p->begin_item("P", "");
    $p->setfont($font, 26);
    $p->fit_textline("Tutorial for", 120, 477, "");
    $p->fit_textline("PDFlib, PDI, and PPS", 120, 440, "");
    $p->end_item($id);

    /*
     * Paragraph 3: subtitle and version. No setfont() call here: the
     * font selected above stays current and each line supplies its own
     * size through the "fontsize" option.
     */
    $id = $p->begin_item("P", "");
    $p->fit_textline("A library for generating PDF on the fly",
        118, 312, "fontsize=20");
    $p->fit_textline("Version 7.0.1", 253, 272, "fontsize=36");
    $p->end_item($id);

    /* Paragraph 4: edition and supported language bindings */
    $id = $p->begin_item("P", "");
    $p->setfont($font, 19);
    $p->fit_textline("General Edition for", 195, 120, "");
    $p->fit_textline("Cobol, C, C++, Java, Perl", 165, 94, "");
    $p->fit_textline("PHP, Python, RPG, Ruby, and Tcl", 140, 68, "");
    $p->end_item($id);

    $p->end_page_ext("");

    $p->end_document("");

    /* Fetch the generated PDF and send it to the client */
    $buf = $p->get_buffer();
    $len = strlen($buf);

    header("Content-type: application/pdf");
    header("Content-Length: $len");
    header("Content-Disposition: inline; filename=scan_with_ocr_pdfua1.pdf");
    print $buf;
}
catch (PDFlibException $e) {
    /* PDFlib errors: report error number, API function name and message */
    echo("PDFlib exception occurred in scan_with_ocr_pdfua1 sample:\n" .
        "[" . $e->get_errnum() . "] " . $e->get_apiname() . ": " .
        $e->get_errmsg() . "\n");
    exit(1);
}
catch (Throwable $e) {
    /* Anything else, including the Exceptions thrown above on 0 returns */
    echo($e);
    exit(1);
}

$p = 0;
?>