TET Cookbook

cookbook

get_attachments

Extract the text from the document and recursively from all embedded PDF attachments.

 

Download Java Code     Show Output     Show Input PDF

package com.pdflib.cookbook.tet.special;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import com.pdflib.TET;
import com.pdflib.TETException;

/**
 * Extract the text from the document and recursively from all embedded PDF
 * attachments.
 * <p>
 * Required software: TET 3
 * <p>
 * Required data: PDF document
 * 
 * @version $Id: get_attachments.java,v 1.3 2008/11/20 08:06:39 stm Exp $
 */
public class get_attachments
{
    /**
     * Global option list. The program expects the "resource" directory parallel
     * to the "java" directory.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
        + "../resource/glyphlist ../input}";

    /**
     * Document specific option list.
     */
    static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    static final String PAGE_OPTLIST = "granularity=page";

    /**
     * The encoding in which the output is sent to System.out. For running
     * the example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING =
                            System.getProperty("file.encoding");
    
    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * Separator to emit after each chunk of text. This depends on the
     * application's needs; for granularity=word a space character may be
     * useful.
     */
    static final String SEPARATOR = "\n";

    /**
     * Extract text from a document for which a TET handle is already available.
     * 
     * @param tet
     *            The TET object
     * @param doc
     *            A valid TET document handle
     * 
     * @throws TETException
     *             An error occurred in the TET API
     */
    static void extract_text(TET tet, int doc) throws TETException {
        /*
         * Get number of pages in the document.
         */
        int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

        /* loop over pages */
        for (int pageno = 1; pageno <= n_pages; ++pageno) {
            String text;
            int page;

            page = tet.open_page(doc, pageno, PAGE_OPTLIST);

            if (page == -1) {
                System.err.println("Error " + tet.get_errnum() + " in  "
                        + tet.get_apiname() + "() on page " + pageno + ": "
                        + tet.get_errmsg());
                continue; /* try next page */
            }

            /*
             * Retrieve all text fragments; This loop is actually not required
             * for granularity=page, but must be used for other granularities.
             */
            while ((text = tet.get_text(page)) != null) {
                out.print(text); // print the retrieved text

                /* print a separator between chunks of text */
                out.print(SEPARATOR);
            }

            if (tet.get_errnum() != 0) {
                System.err.println("Error " + tet.get_errnum() + " in  "
                        + tet.get_apiname() + "() on page " + pageno + ": "
                        + tet.get_errmsg());
            }

            tet.close_page(page);
        }
    }

    /**
     * Open a named physical or virtual file, extract the text from it, search
     * for document or page attachments, and process these recursively. Either
     * filename must be supplied for physical files, or data+length from which a
     * virtual file will be created. The caller cannot create the PVF file since
     * we create a new TET object here in case an exception happens with the
     * embedded document - the caller can happily continue with his TET object
     * even in case of an exception here.
     * 
     * @param filename
     *            The filename for an input file on disk (can be null)
     * @param attachmentname
     *            The name of the attachment for displaying it to the user
     * @param data
     *            Data of a PDF document loaded in memory (can be null)
     * 
     * @return 0 if successful, otherwise a non-null code to be used as exit
     *         status
     */
    static int process_document(String filename, String attachmentname,
            byte[] data) {
        int retval = 0;
        TET tet = null;
        try {
            final String pvfname = "/pvf/attachment";

            tet = new TET();

            /*
             * Construct a PVF file if data instead of a filename was provided
             */
            if (filename == null || filename.length() == 0) {
                tet.create_pvf(pvfname, data, "");
                filename = pvfname;
            }

            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(filename, DOC_OPTLIST);
            if (doc == -1) {
                System.err.println("Error " + tet.get_errnum() + " in  "
                        + tet.get_apiname() + "() (source: attachment '"
                        + attachmentname + "'): " + tet.get_errmsg());

                retval = 5;
            }
            else {
                process_document(tet, doc);
            }

            /*
             * If there was no PVF file deleting it won't do any harm
             */
            tet.delete_pvf(pvfname);
        }
        catch (TETException e) {
            System.err.println("Error " + e.get_errnum() + " in  "
                    + e.get_apiname() + "() (source: attachment '" + attachmentname
                    + "'): " + e.get_errmsg());
            retval = 1;
        }
        finally {
            if (tet != null) {
                tet.delete();
            }
        }

        return retval;
    }

    /**
     * Process a single file.
     * 
     * @param tet
     *            The TET object
     * @param doc
     *            The TET document handle
     * 
     * @throws TETException
     *             An error occurred in the TET API.
     */
    private static void process_document(TET tet, int doc) throws TETException {
        String objtype;

        // -------------------- Extract the document's own page contents
        extract_text(tet, doc);

        // -------------------- Process all document-level file attachments

        // Get the number of document-level file attachments.
        int filecount = (int) tet.pcos_get_number(doc,
                "length:names/EmbeddedFiles");

        for (int file = 0; file < filecount; file++) {
            String attname;

            /*
             * fetch the name of the file attachment; check for Unicode file
             * name (a PDF 1.7 feature)
             */
            objtype = tet.pcos_get_string(doc, "type:names/EmbeddedFiles["
                    + file + "]/UF");

            if (objtype.equals("string")) {
                attname = tet.pcos_get_string(doc, "names/EmbeddedFiles["
                        + file + "]/UF");
            }
            else {
                objtype = tet.pcos_get_string(doc, "type:names/EmbeddedFiles["
                        + file + "]/F");

                if (objtype.equals("string")) {
                    attname = tet.pcos_get_string(doc, "names/EmbeddedFiles["
                            + file + "]/F");
                }
                else {
                    attname = "(unnamed)";
                }
            }
            /* fetch the contents of the file attachment and process it */
            objtype = tet.pcos_get_string(doc, "type:names/EmbeddedFiles["
                    + file + "]/EF/F");

            if (objtype.equals("stream")) {
                out.println("----- File attachment '" + attname + "':");
                byte attdata[] = tet.pcos_get_stream(doc, "",
                        "names/EmbeddedFiles[" + file + "]/EF/F");

                process_document(null, attname, attdata);
                out.println("----- End file attachment '" + attname + "'");
            }
        }

        // -------------------- Process all page-level file attachments

        int pagecount = (int) tet.pcos_get_number(doc, "length:pages");

        // Check all pages for annotations of type FileAttachment
        for (int page = 0; page < pagecount; page++) {
            int annotcount = (int) tet.pcos_get_number(doc, "length:pages["
                    + page + "]/Annots");

            for (int annot = 0; annot < annotcount; annot++) {
                String val;
                String attname;

                val = tet.pcos_get_string(doc, "pages[" + page + "]/Annots["
                        + annot + "]/Subtype");

                attname = "page " + (page + 1) + ", annotation " + (annot + 1);
                if (val.equals("FileAttachment")) {
                    String attpath = "pages[" + page + "]/Annots[" + annot
                            + "]/FS/EF/F";
                    /*
                     * fetch the contents of the attachment and process it
                     */
                    objtype = tet.pcos_get_string(doc, "type:" + attpath);

                    if (objtype.equals("stream")) {
                        out.println("----- Page level attachment '" + attname + "':");
                        byte attdata[] = tet.pcos_get_stream(doc, "", attpath);
                        process_document(null, attname, attdata);
                        out.println("----- End page level attachment '" + attname + "':");
                    }
                }
            }
        }

        tet.close_document(doc);
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        int ret = 0;

        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            System.err.println("usage: get_attachments <infilename>");
            System.exit(2);
        }

        try {
            ret = process_document(args[0], args[0], null);
        }
        catch (Exception e) {
            e.printStackTrace();
            ret = 1;
        }

        System.exit(ret);
    }
}