image/images_in_memory
Simple image reader
Download Java Code Show Output Show Input (invoice_pdfua1.pdf)
/*
* PDF image reader based on PDFlib TET. The example demonstrates the extraction
* of images in memory for feeding them to the Java Image I/O API in order to
* get the image metadata.
*
* The javax.imageio package that comes with the standard JRE is limited in the
* image formats it supports. There is no support for JPEG2000 and JBIG2
* images. For these image formats it is possible to install plugins that extend
* the functionality of javax.imageio.
*
* Note that some PDF producers embed JPEG images in PDF which cannot be
* processed by javax.imageio, and may trigger exceptions like the following:
*
*   javax.imageio.IIOException: Inconsistent metadata read from stream
*     at com.sun.imageio.plugins.jpeg.JPEGMetadata.<init>(JPEGMetadata.java:362)
*     at com.sun.imageio.plugins.jpeg.JPEGImageReader.getImageMetadata(JPEGImageReader.java:1023)
*     at com.pdflib.cookbook.tet.image.images_in_memory.print_metadata(images_in_memory.java:264)
*     at com.pdflib.cookbook.tet.image.images_in_memory.main(images_in_memory.java:191)
*
* Required software: TET 5.2
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.image;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.pdflib.TETException;
import com.pdflib.TET;
/*
 * Command-line sample: opens a PDF with TET, fetches every image of every page
 * into memory and prints the metadata tree that javax.imageio reports for it.
 */
public class images_in_memory {
    /*
     * Global option list
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist ../input}";

    /*
     * Document-specific option list
     */
    static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list
     */
    static final String PAGE_OPTLIST = "granularity=page";

    /*
     * Basic image extract options (more below)
     */
    static final String BASE_IMAGE_OPTLIST = "";

    /*
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * A sequence of blanks or tabs used for indenting the metadata tree.
     */
    private static final String METADATA_INDENTATION = " ";

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Assigned at the start of main().
     */
    private static PrintStream out;

    /*
     * Program entry point. Expects exactly one argument, the name of the PDF
     * document to process. On any failure the process terminates with exit
     * status 1; on success main() simply returns.
     *
     * @param argv Command line arguments: the input file name.
     *
     * @throws UnsupportedEncodingException OUTPUT_ENCODING is not supported.
     */
    public static void main(String[] argv) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        /*
         * The exit status is only recorded inside the catch clauses. Calling
         * System.exit() there would terminate the JVM before the finally block
         * gets a chance to run, so the TET object would never be deleted.
         */
        int exitCode = 0;
        TET tet = null;
        try {
            if (argv.length != 1) {
                throw new Exception("usage: images_in_memory <filename>");
            }

            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(argv[0], DOC_OPTLIST);
            if (doc == -1) {
                throw new Exception(
                        "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /* get number of pages in the document */
            int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /* loop over pages */
            for (int pageno = 1; pageno <= n_pages; ++pageno) {
                int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
                if (page < 0) {
                    print_tet_error(tet, pageno);
                    continue; /* try next page */
                }

                process_page_images(tet, doc, page, pageno);

                /* report an error that ended the image enumeration, if any */
                if (tet.get_errnum() != 0) {
                    print_tet_error(tet, pageno);
                }
                tet.close_page(page);
            }

            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("TET exception occurred in extractor sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
            exitCode = 1;
        } catch (Exception e) {
            e.printStackTrace();
            exitCode = 1;
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }

        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }

    /*
     * Retrieve all images on an open page, fetch each one in memory and print
     * its metadata. Images of unknown type or without data are reported on
     * System.err and skipped.
     *
     * @param tet The TET object
     * @param doc The document handle returned by open_document()
     * @param page The page handle returned by open_page()
     * @param pageno The page number, used for messages only
     *
     * @throws TETException An error was raised by the TET API.
     * @throws IOException An error occurred in the ImageIO API.
     */
    private static void process_page_images(TET tet, int doc, int page, int pageno)
            throws TETException, IOException {
        /* 1-based number of the current image on the page, as used in all messages */
        int imageno = 0;

        while (tet.get_image_info(page) == 1) {
            imageno++;

            /*
             * Invoke the write_image_file routine with option "typeonly", to find out the
             * type of the image without actually writing an image file.
             */
            int imageType = tet.write_image_file(doc, tet.imageid, BASE_IMAGE_OPTLIST + " typeonly");

            /*
             * Map the numerical image type to a string identifier
             */
            String imageFormat = image_format_name(imageType);
            if (imageFormat == null) {
                System.err.println(
                        "Page " + pageno + " image " + imageno + ": write_image_file returned unknown value "
                                + imageType + ", skipping image, error: " + tet.get_errmsg());
                continue;
            }

            /*
             * Fetch the image data in memory.
             */
            byte[] imagedata = tet.get_image_data(doc, tet.imageid, BASE_IMAGE_OPTLIST);
            if (imagedata == null) {
                print_tet_error(tet, pageno);
                continue; /* process next image */
            }

            /*
             * Do something meaningful with the data. Here we try to extract image metadata.
             */
            print_metadata(imagedata, imageFormat, pageno, imageno);
        }
    }

    /*
     * Map a numerical TET image type to the format name that is used for
     * looking up a Java ImageReader.
     *
     * @param imageType One of the TET.IF_* constants.
     *
     * @return The format name, or null if the image type is not known.
     */
    private static String image_format_name(int imageType) {
        switch (imageType) {
        case TET.IF_TIFF:
            return "tiff";
        case TET.IF_JPEG:
            return "jpg";
        case TET.IF_JP2:
            return "jp2";
        case TET.IF_JPF:
            return "jpf";
        case TET.IF_J2K:
            return "j2k";
        case TET.IF_JBIG2:
            return "jbig2";
        default:
            return null;
        }
    }

    /*
     * Try to consume the binary image data with the Java ImageReader class, and
     * print out any available image metadata. The ImageReader and the
     * ImageInputStream created here are released again before returning.
     *
     * @param imagedata The binary data of the image.
     * @param imageFormat The name of the image format.
     * @param pageno The page number on which the image was found.
     * @param imageno The number of the image on the page.
     *
     * @throws IOException An error occurred in the ImageIO API.
     */
    private static void print_metadata(byte[] imagedata, String imageFormat, int pageno, int imageno)
            throws IOException {
        /*
         * First try to find a suitable ImageReader class.
         */
        Iterator<ImageReader> readerIterator = ImageIO.getImageReadersByFormatName(imageFormat);
        if (readerIterator == null || !readerIterator.hasNext()) {
            System.err.println("No Java ImageReader available for suffix " + imageFormat);
            return;
        }

        /*
         * We try only the first ImageReader from potentially multiple ImageReaders.
         */
        ImageReader reader = readerIterator.next();
        ImageInputStream inputStream = null;
        try {
            /*
             * Create an ImageInputStream from the binary data and feed it to the
             * ImageReader instance. createImageInputStream() is documented to
             * return null when no suitable stream provider is registered.
             */
            inputStream = ImageIO.createImageInputStream(new ByteArrayInputStream(imagedata));
            if (inputStream == null) {
                System.err.println(
                        "No ImageInputStream available for image data (page " + pageno + ", image " + imageno + ")");
                return;
            }
            reader.setInput(inputStream);

            /*
             * Try to retrieve the metadata and print it if available.
             */
            try {
                IIOMetadata metadata = reader.getImageMetadata(0);
                if (metadata != null) {
                    String format = metadata.getNativeMetadataFormatName();
                    if (format != null) {
                        Node tree = metadata.getAsTree(format);
                        print_metadata(tree, pageno, imageno);
                    } else {
                        /* getAsTree(null) would throw an IllegalArgumentException */
                        System.err.println(
                                "No native metadata format available (page " + pageno + ", image " + imageno + ")");
                    }
                }
            } catch (IOException e) {
                System.err.println("getImageMetadata() raised exception (page " + pageno + ", image " + imageno + "):");
                e.printStackTrace();
            }
        } finally {
            /* release the reader and the stream, which would otherwise pile up per image */
            reader.dispose();
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }

    /*
     * Print out the metadata in the DOM tree, preceded by a heading line.
     *
     * @param tree The DOM tree to print.
     * @param pageno The page number of the image to which the metadata belongs.
     * @param imageno The number of the image on the page.
     */
    private static void print_metadata(Node tree, int pageno, int imageno) {
        out.println("Metadata for image " + imageno + " on page " + pageno + ":");
        print_metadata(tree, 0);
    }

    /*
     * Recursively walk the DOM subtree given by node and print out node name,
     * node value (if any) and attributes.
     *
     * @param node The subtree to print.
     * @param level The current level in the total tree, used for indentation.
     */
    private static void print_metadata(Node node, int level) {
        String indentation = get_indentation(level);

        out.print(indentation + "node=\"" + node.getNodeName() + "\"");
        String value = node.getNodeValue();
        if (value != null) {
            out.print(" value=\"" + value + "\"");
        }
        out.println();

        /* attributes go on a separate line, indented slightly deeper than the node */
        NamedNodeMap map = node.getAttributes();
        if (map != null) {
            int length = map.getLength();
            if (length > 0) {
                out.print(indentation + " ");
                for (int i = 0; i < length; i++) {
                    Node attr = map.item(i);
                    out.print(" " + attr.getNodeName() + "=\"" + attr.getNodeValue() + "\"");
                }
                out.println();
            }
        }

        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i += 1) {
            print_metadata(children.item(i), level + 1);
        }
    }

    /*
     * Produce an indentation string according to parameter level.
     *
     * @param level Indentation level
     *
     * @return A string composed of the METADATA_INDENTATION times level; empty
     * for a level of zero or less
     */
    private static String get_indentation(int level) {
        StringBuilder indentation = new StringBuilder();
        for (int i = 0; i < level; i += 1) {
            indentation.append(METADATA_INDENTATION);
        }
        return indentation.toString();
    }

    /*
     * Report a TET error on System.err.
     *
     * @param tet The TET object
     * @param pageno The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno) {
        System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
    }
}