// Resource-based image extractor based on PDFlib TET
// Required software: TET 4
import com.pdflib.TET;
import com.pdflib.TETException;
/**
* Resource-based image extractor based on PDFlib TET
* <p>
* Required software: TET 4
* <p>
* Required data: PDF document
*
* @version $Id: image_resources.java,v 1.2 2010/07/27 11:25:51 stm Exp $
*/
public class image_resources
{
    /**
     * Global option list
     */
    static final String globaloptlist = "searchpath={{../input}} ";

    /**
     * Document-specific option list
     */
    static final String docoptlist = "";

    /**
     * Page-specific option list
     */
    static final String pageoptlist = "";

    /**
     * Basic image extraction options. They are prepended to the
     * per-image option list that carries the output file name.
     */
    static final String baseimageoptlist = "";

    /**
     * Opens the PDF named on the command line, opens every page once so
     * that merged image resources are created, then reports and writes
     * every image resource that was not consumed by merging.
     *
     * @param argv exactly one argument: the name of the input PDF; it is
     *             also used as the base of the output image file names
     */
    public static void main (String argv[])
    {
        if (argv.length != 1)
        {
            System.err.println("usage: image_resources <filename>");
            return;
        }

        String outfilebase = argv[0];
        TET tet = null;

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(argv[0], docoptlist);
            if (doc == -1)
            {
                System.err.println("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
                return; /* the finally block still deletes the TET object */
            }

            /*
             * Images will only be merged upon opening a page.
             * In order to enumerate all merged image resources
             * we open all pages before extracting the images.
             */
            open_all_pages(tet, doc);

            extract_images(tet, doc, outfilebase);

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            System.err.println(
                "TET exception occurred in image_resources sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() +
                ": " + e.get_errmsg());
        }
        catch (Exception e)
        {
            /*
             * Only unexpected exceptions arrive here; print the exception
             * itself so that its type is visible even without a message.
             */
            System.err.println(e);
        }
        finally
        {
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /**
     * Open and immediately close every page of the document. Pages that
     * cannot be opened are reported and skipped.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @throws TETException if a TET call fails
     */
    private static void open_all_pages(TET tet, int doc)
        throws TETException
    {
        /* get number of pages in the document */
        int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

        for (int pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                print_tet_error(tet, pageno);
                continue; /* try next page */
            }

            /* the page was opened, but an error number may still be set */
            if (tet.get_errnum() != 0)
            {
                print_tet_error(tet, pageno);
            }

            tet.close_page(page);
        }
    }

    /**
     * Report every image resource with mergetype 0 (normal) or
     * 1 (artificial) and write it to a disk file. Images with any other
     * mergetype have been consumed by merging and are skipped.
     * <p>
     * The output file names are generated from the input file name by
     * appending "_I" and the image ID.
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param outfilebase The base of the output file names
     * @throws TETException if a TET call fails
     */
    private static void extract_images(TET tet, int doc, String outfilebase)
        throws TETException
    {
        /* get number of image resources in the document */
        int n_images = (int) tet.pcos_get_number(doc, "length:images");

        for (int imageid = 0; imageid < n_images; imageid++)
        {
            int mergetype = (int) tet.pcos_get_number(doc,
                "images[" + imageid + "]/mergetype");

            /* skip images which have been consumed by merging */
            if (mergetype != 0 && mergetype != 1)
            {
                continue;
            }

            report_image_info(tet, doc, imageid);

            String imageoptlist = baseimageoptlist
                + " filename={" + outfilebase + "_I" + imageid + "}";

            if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
            {
                /* report the failing image; there is no page to blame */
                print_tet_image_error(tet, imageid);
            }
        }
    }

    /**
     * Report image info.
     *
     * Print the following information for the image on one line:
     *
     * - pCOS id (the index into the images[] array), prefixed with "I"
     * - pixel size (Width x Height) of the image
     * - number of components, bits per component, and colorspace name,
     *   or "JPEG2000" if the colorspace id is -1
     * - mergetype if different from 0 (normal): "artificial" for 1,
     *   "consumed" for any other value
     *
     * @param tet The TET object
     * @param doc The document handle
     * @param imageid The image ID
     * @throws TETException if a pCOS query fails
     */
    private static void report_image_info(TET tet, int doc, int imageid)
        throws com.pdflib.TETException
    {
        String image = "images[" + imageid + "]";

        int width = (int) tet.pcos_get_number(doc, image + "/Width");
        int height = (int) tet.pcos_get_number(doc, image + "/Height");
        int bpc = (int) tet.pcos_get_number(doc, image + "/bpc");
        int cs = (int) tet.pcos_get_number(doc, image + "/colorspaceid");

        System.out.print("image I" + imageid);
        System.out.print(", " + width + "x" + height + " pixel, ");

        if (cs != -1)
        {
            String colorspace = "colorspaces[" + cs + "]";
            System.out.print(
                (int) tet.pcos_get_number(doc, colorspace + "/components")
                + "x" + bpc + " bit "
                + tet.pcos_get_string(doc, colorspace + "/name"));
        }
        else
        {
            /* cs==-1 may happen for some JPEG 2000 images. bpc,
             * colorspace name and number of components are not
             * available in this case.
             */
            System.out.print("JPEG2000");
        }

        int mergetype = (int) tet.pcos_get_number(doc, image + "/mergetype");

        /* mergetype==0 means normal image */
        if (mergetype != 0)
        {
            System.out.print(", mergetype=");
            if (mergetype == 1)
            {
                System.out.print("artificial");
            }
            else
            {
                System.out.print("consumed");
            }
        }

        System.out.println("");
    }

    /**
     * Report a TET error that occurred while processing a page.
     *
     * @param tet The TET object
     * @param pageno The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno)
    {
        System.err.println("Error " + tet.get_errnum() + " in "
            + tet.get_apiname() + "() on page " + pageno + ": "
            + tet.get_errmsg());
    }

    /**
     * Report a TET error that occurred while processing an image resource.
     *
     * @param tet The TET object
     * @param imageid The ID of the image for which the error occurred
     */
    private static void print_tet_image_error(TET tet, int imageid)
    {
        System.err.println("Error " + tet.get_errnum() + " in "
            + tet.get_apiname() + "() for image I" + imageid + ": "
            + tet.get_errmsg());
    }
}