Simple image reader
Download Java Code Show Output Show Input (invoice_pdfua1.pdf)
* PDF image reader based on PDFlib TET. The example demonstrates the extraction
* of images in memory for feeding them to the Java Image I/O API in order to
* get the image metadata.
* The javax.imageio package that comes with the standard JRE is limited in what
* image formats it supports. There is no support for JPEG2000 and JBIG2
* images. For these image formats it is possible to install plugins that extend
* the functionality of javax.imageio.
* Note that some PDF producers embed JPEG images in PDF which cannot be
* processed by javax.imageio, and may trigger exceptions like the following:
* javax.imageio.IIOException: Inconsistent metadata read from stream at
* com.sun.imageio.plugins.jpeg.JPEGMetadata.<init>(
* at
* com.sun.imageio.plugins.jpeg.JPEGImageReader.getImageMetadata(
* at
* com.pdflib.cookbook.tet.image.images_in_memory.print_metadata(
* at
* com.pdflib.cookbook.tet.image.images_in_memory.main(
* Required software: TET 5.2
* Required data: PDF document
package com.pdflib.cookbook.tet.image;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.pdflib.TETException;
import com.pdflib.TET;
public class images_in_memory {
* Global option list
static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";
* Document-specific option list
static final String DOC_OPTLIST = "";
* Page-specific option list
static final String PAGE_OPTLIST = "granularity=page";
* Basic image extract options (more below)
static final String BASE_IMAGE_OPTLIST = "";
* The encoding in which the output is sent to System.out. For running the
* example in a Windows command window, you can set this for example to
* "windows-1252" for getting Latin-1 output.
private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");
* A sequence of blanks or tabs used for indenting the metadata tree.
private static final String METADATA_INDENTATION = " ";
* For printing to System.out in the encoding specified via OUTPUT_ENCODING.
private static PrintStream out;
public static void main(String argv[]) throws UnsupportedEncodingException {
System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
out = new PrintStream(System.out, true, OUTPUT_ENCODING);
TET tet = null;
try {
if (argv.length != 1) {
throw new Exception("usage: images_in_memory <filename>");
tet = new TET();
int doc = tet.open_document(argv[0], DOC_OPTLIST);
if (doc == -1) {
throw new Exception(
"Error " + tet.get_errnum() + "in " + tet.get_apiname() + "(): " + tet.get_errmsg());
/* get number of pages in the document */
int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
/* loop over pages */
for (int pageno = 1; pageno <= n_pages; ++pageno) {
int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
if (page < 0) {
print_tet_error(tet, pageno);
continue; /* try next page */
/* Retrieve all images on the page */
int imageno = -1;
while (tet.get_image_info(page) == 1) {
* Invoke the write_image_file routine with option "typeonly", to find out the
* type of the image without actually writing an imagefile.
int imageType = tet.write_image_file(doc, tet.imageid, BASE_IMAGE_OPTLIST + " typeonly");
* Map the numerical image type to a string identifier
String imageFormat;
switch (imageType) {
imageFormat = "tiff";
imageFormat = "jpg";
case TET.IF_JP2:
imageFormat = "jp2";
case TET.IF_JPF:
imageFormat = "jpf";
case TET.IF_J2K:
imageFormat = "j2k";
case TET.IF_JBIG2:
imageFormat = "jbig2";
"Page " + pageno + " image " + imageno + ": write_image_file returned unknown value "
+ imageType + ", skipping image, error: " + tet.get_errmsg());
* Fetch the image data in memory.
byte imagedata[] = tet.get_image_data(doc, tet.imageid, BASE_IMAGE_OPTLIST);
if (imagedata == null) {
print_tet_error(tet, pageno);
continue; /* process next image */
* Do something meaningful with the data. Here we try to extract image metadata.
print_metadata(imagedata, imageFormat, pageno, imageno + 1);
if (tet.get_errnum() != 0) {
print_tet_error(tet, pageno);
} catch (TETException e) {
System.err.println("TET exception occurred in extractor sample:");
System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
} catch (Exception e) {
} finally {
if (tet != null) {
* Try to consume the the binary image data with the Java ImageReader class, and
* print out any available image metadata.
* @param imagedata The binary data of the image.
* @param imageFormat The name of the image format.
* @param pageno The page number on which the image was found.
* @param imageno The number of the image on the page.
* @throws IOException An error occured in the ImageIO API.
private static void print_metadata(byte[] imagedata, String imageFormat, int pageno, int imageno)
throws IOException {
* Try to consume the the binary imagedata with the Java ImageReader class.
* First try to find a suitable ImageReader class.
Iterator<ImageReader> readerIterator = ImageIO.getImageReadersByFormatName(imageFormat);
if (readerIterator != null && readerIterator.hasNext()) {
* We try only the first ImageReader from potentially multiple ImageReaders.
ImageReader reader = (ImageReader);
* Create an ImageInputStream from the binary data and feed it to the
* ImageReader instance.
ImageInputStream inputStream = ImageIO.createImageInputStream(new ByteArrayInputStream(imagedata));
* Try to retrieve the metadata and print it if available.
IIOMetadata metadata;
try {
metadata = reader.getImageMetadata(0);
if (metadata != null) {
String format = metadata.getNativeMetadataFormatName();
Node tree = metadata.getAsTree(format);
print_metadata(tree, pageno, imageno);
} catch (IOException e) {
System.err.println("getImageMetadata() raised exception (page " + pageno + ", image " + imageno + "):");
} else {
System.err.println("No Java ImageReader available for suffix " + imageFormat);
* Print out the metadata in the DOM tree.
* @param tree The DOM tree to print.
* @param pageno The page number of the image to which the metadata belongs.
* @param imageno The number of the image on the page.
private static void print_metadata(Node tree, int pageno, int imageno) {
out.println("Metadata for image " + imageno + " on page " + pageno + ":");
print_metadata(tree, 0);
* Recursively walk the DOM subtree given by node and print out node name and
* attributes.
* @param node The subtree to print.
* @param level The current level in the total tree, used for indentation.
private static void print_metadata(Node node, int level) {
String indentation = get_indentation(level);
out.print(indentation + "node=\"" + node.getNodeName() + "\"");
String value = node.getNodeValue();
if (value != null) {
out.print(" value=\"" + value + "\"");
NamedNodeMap map = node.getAttributes();
if (map != null) {
int length = map.getLength();
if (length > 0) {
out.print(indentation + " ");
for (int i = 0; i < length; i++) {
Node attr = map.item(i);
out.print(" " + attr.getNodeName() + "=\"" + attr.getNodeValue() + "\"");
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i += 1) {
print_metadata(children.item(i), level + 1);
* Produce an indentation string according to parameter level.
* @param level Indentation level
* @return A string composed of the METADATA_INDENTATION times level
private static String get_indentation(int level) {
StringBuffer indentation = new StringBuffer();
for (int i = 0; i < level; i += 1) {
return indentation.toString();
* Report a TET error.
* @param tet The TET object
* @param pageno The page number on which the error occurred
private static void print_tet_error(TET tet, int pageno) {
System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "() on page " + pageno + ": "
+ tet.get_errmsg());