interchange/dpart_to_xml

Transform a PDF/VT DPart tree to XML.
Download Java Code Show Output Show Input (starter_pdfvt1.pdf)
/*
 * pCOS sample application for dumping the "Document Part Hierarchy" of a PDF/VT
 * document as XML as specified in Annex D of ISO 16612-2:2010. If no Document
 * Part Hierarchy is present, an XML file with an empty "<PDFVT>" element is
 * created.
 *
 * In order to generate well-formed XML a Java-specific method for generating
 * the XML output is used. When porting this code to other programming
 * languages, this must be replaced accordingly.
 *
 * The code to dump a DPM dictionary is not protected against incorrect endless
 * recursive dictionaries. A check for this should be implemented in
 * production code.
 *
 * Required software: pCOS interface 8 (PDFlib+PDI/PPS 9, TET 4.1, PLOP 5.0)
 * Required data: PDF/VT or PDF 2.0 document with DParts
 */
package com.pdflib.cookbook.pcos.interchange;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.Locale;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.xml.sax.helpers.AttributesImpl;

import com.pdflib.IpCOS;
import com.pdflib.cookbook.pcos.pcos_cookbook_example;

public class dpart_to_xml extends pcos_cookbook_example {

    /**
     * Max. length of a sub-array in the DParts array of a DPart dictionary.
     * Used by dump_dpart_node() to warn about sub-arrays that are longer
     * than this, or (for every sub-array but the last one) that do not
     * have exactly this many entries.
     */
    private static final int DPARTS_MAX_LENGTH = 8192;

    /*
     * This is where the data files are. Adjust as necessary. The value is
     * handed to the constructor as search path by main().
     */
    private final static String SEARCH_PATH = "../input";

    /**
     * Dump the Document Part Hierarchy of the document to an XML file named
     * "&lt;input file name&gt;.dpart.xml" in the current working directory.
     *
     * If the Catalog has no DPartRoot entry, an XML file with an empty
     * {@code <PDFVT>} element is written. The output file is closed in all
     * cases, including when an exception is thrown.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     *
     * @throws Exception
     *             a pCOS error occurred, the XML serialization failed, or
     *             the DPartRoot dictionary is malformed
     */
    public void example_code(IpCOS p, int doc) throws Exception {

        /* Open the XML output document and set up an XML serialization to it */
        String filename = p.pcos_get_string(doc, "filename");
        String xml_name = new File(filename).getName() + ".dpart.xml";
        Writer xml = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(xml_name), "UTF-8"));

        try {
            System.out.println("Writing Document Part Hierarchy to file "
                + xml_name);

            SAXTransformerFactory transformer_factory =
                (SAXTransformerFactory) SAXTransformerFactory.newInstance();
            try {
                transformer_factory.setAttribute("indent-number",
                    Integer.valueOf(2));
            }
            catch (IllegalArgumentException ignored) {
                /*
                 * "indent-number" is an implementation-specific attribute.
                 * Factories that do not know it reject it; as it only
                 * affects the cosmetic indentation width we carry on with
                 * the factory's default.
                 */
            }

            TransformerHandler handler =
                transformer_factory.newTransformerHandler();
            Transformer serializer = handler.getTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.METHOD, "xml");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            handler.setResult(new StreamResult(xml));
            handler.startDocument();

            /*
             * Put some information about the input document into an XML
             * comment
             */
            String comment = "\nInput PDF document: " + filename
                    + "\nPDF version: "
                    + p.pcos_get_string(doc, "pdfversionstring")
                    + "\nPDF/VT version: " + p.pcos_get_string(doc, "pdfvt")
                    + "\n";
            handler.comment(comment.toCharArray(), 0, comment.length());

            /*
             * Fixed XML root element of the document.
             */
            handler.startElement("", "", "PDFVT", new AttributesImpl());

            String objtype = p.pcos_get_string(doc, "type:/Root/DPartRoot");
            if (objtype.equals("dict")) {
                /*
                 * Retrieve the required list of node names.
                 */
                objtype = p.pcos_get_string(doc,
                    "type:/Root/DPartRoot/NodeNameList");
                if (!objtype.equals("array")) {
                    throw new Exception(
                        "Required entry \"NodeNameList\" is missing in DPartRoot dictionary or has wrong type");
                }

                int num_names = (int) p.pcos_get_number(doc,
                    "length:/Root/DPartRoot/NodeNameList");
                String node_names[] = new String[num_names];
                for (int i = 0; i < num_names; i += 1) {
                    node_names[i] = p.pcos_get_string(doc,
                        "/Root/DPartRoot/NodeNameList[" + i + "]");
                }

                objtype = p.pcos_get_string(doc,
                    "type:/Root/DPartRoot/DPartRootNode");
                if (!objtype.equals("dict")) {
                    throw new Exception(
                        "Required entry \"DPartRootNode\" is missing in DPartRoot dictionary or has wrong type");
                }

                /*
                 * Invoke the recursive method that dumps the document part
                 * hierarchy
                 */
                dump_dpart_node(p, doc, handler, 0, 0, node_names,
                    "/Root/DPartRoot/DPartRootNode");
            }
            else if (objtype.equals("null")) {
                /*
                 * No Document Part Hierarchy present: the <PDFVT> element
                 * simply stays empty.
                 */
                System.out.println(
                    "No Document Part Hierarchy found, writing empty <PDFVT> element");
            }
            else {
                throw new Exception(
                    "DPartRoot entry in Catalog has wrong type \"" + objtype
                        + "\"");
            }

            handler.endElement("", "", "PDFVT");

            /*
             * Terminate the SAX event stream so that the serializer writes
             * out everything it may still have buffered.
             */
            handler.endDocument();
        }
        finally {
            xml.close();
        }
    }

    /**
     * Recursively transform a DPart node to XML.
     *
     * The node is written as an element named after the entry of
     * node_names for its level. It contains an optional DPM element,
     * followed either by the elements of its child nodes (inner node with a
     * DParts array) or by one empty PDFPage element per page (leaf node with
     * a Start and an optional End entry).
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param level
     *            level in DPart tree (zero-based)
     * @param start_page
     *            zero-based number of start page of DPart subtree
     * @param node_names
     *            array containing the names for the levels of the DPart tree
     * @param dpart_node_path
     *            pCOS path of the DPart node
     * @return number of pages covered by the DPart subtree
     *
     * @throws Exception
     *             a pCOS error occurred, the XML serialization failed, or a
     *             non-recoverable error was discovered in the DPart tree
     */
    private int dump_dpart_node(IpCOS p, int doc, TransformerHandler handler,
        int level, int start_page, String node_names[], String dpart_node_path)
        throws  Exception {

        /*
         * Consistency check: The DPart tree may not be deeper than the length
         * of the NodeNameList array, as for every level of the tree the
         * corresponding entry from the NodeNameList array is used to name the
         * XML element
         */
        if (level >= node_names.length) {
            throw new Exception(
                "Document part hierarchy is deeper than allowed by the length of the NodeNameList array in the DPartRoot dictionary");
        }

        /*
         * XML attributes are not used in the DPart-to-XML conversion, but an
         * empty AttributesImpl object must be provided nevertheless.
         */
        AttributesImpl attributes = new AttributesImpl();
        handler.startElement("", "", node_names[level], attributes);

        /*
         * Transform the document part metadata to XML if present.
         */
        String dpm_path = dpart_node_path + "/DPM";
        String dpm_objtype = p.pcos_get_string(doc, "type:" + dpm_path);
        if (dpm_objtype.equals("dict")) {
            handler.startElement("", "", "DPM", attributes);
            dump_dpm(p, doc, handler, dpm_path);
            handler.endElement("", "", "DPM");
        }
        else if (!dpm_objtype.equals("null")) {
            System.err.println("Warning: Unexpected type \"" + dpm_objtype
                + "\" for DPM entry, skipping it (pCOS Path \"" + dpart_node_path + "\")");
        }

        /*
         * Retrieve information whether this is an inner node of the tree or
         * a leaf.
         */
        String dparts_array = dpart_node_path + "/DParts";
        String dparts_objtype = p.pcos_get_string(doc, "type:" + dparts_array);
        String start_objtype = p.pcos_get_string(doc, "type:" + dpart_node_path
            + "/Start");

        if (!dparts_objtype.equals("null") && !start_objtype.equals("null")) {
            throw new Exception(
                "DPart dictionary contains both a DParts and a Start key (pCOS Path \""
                + dpart_node_path + "\")");
        }

        /* page count for subtree */
        int page_count;

        if (dparts_objtype.equals("null")) {
            page_count = dump_dpart_pages(p, doc, handler, start_page,
                dpart_node_path);
        }
        else if (dparts_objtype.equals("array")) {
            page_count = dump_dpart_children(p, doc, handler, level,
                start_page, node_names, dparts_array);
        }
        else {
            /* Report the type of the DParts entry itself */
            throw new Exception("DParts entry has wrong type \"" + dparts_objtype
                + "\" (pCOS path \"" + dparts_array + "\")");
        }

        handler.endElement("", "", node_names[level]);

        return page_count;
    }

    /**
     * Transform all child nodes referenced from the DParts array of an inner
     * DPart node to XML.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param level
     *            level of the parent node in the DPart tree (zero-based)
     * @param start_page
     *            zero-based number of start page of the parent's subtree
     * @param node_names
     *            array containing the names for the levels of the DPart tree
     * @param dparts_array
     *            pCOS path of the DParts array of the parent node
     * @return number of pages covered by all children
     *
     * @throws Exception
     *             a pCOS error occurred, the XML serialization failed, or an
     *             entry of the DParts array is not an array
     */
    private int dump_dpart_children(IpCOS p, int doc,
        TransformerHandler handler, int level, int start_page,
        String node_names[], String dparts_array) throws Exception {

        int page_count = 0;

        /*
         * This is an array of arrays, where each sub-array has a
         * maximum of 8192 entries.
         */
        int dparts_length = (int) p.pcos_get_number(doc, "length:" + dparts_array);
        for (int i = 0; i < dparts_length; i += 1) {
            String dparts_array_entry = dparts_array + "[" + i + "]";
            String entry_objtype = p.pcos_get_string(doc, "type:" + dparts_array_entry);
            if (!entry_objtype.equals("array")) {
                throw new Exception(
                    "DParts array entry has wrong type \"" + entry_objtype
                        + "\" (pCOS path \"" + dparts_array_entry + "\")");
            }

            /*
             * Check that every sub-array but the last one has exactly
             * 8192 entries.
             */
            int dparts_entry_length = (int) p.pcos_get_number(doc,
                "length:" + dparts_array_entry);
            if (i < dparts_length - 1 && dparts_entry_length != DPARTS_MAX_LENGTH) {
                System.err.println("Warning: DParts sub-array has " + dparts_entry_length
                    + " entries, should be 8192 (pCOS path \"" + dparts_array_entry + "\")");
            }
            else if (dparts_entry_length > DPARTS_MAX_LENGTH) {
                System.err.println("Warning: DParts sub-array has more than 8192 entries (pCOS path \"" + dparts_array_entry + "\")");
            }
            else if (dparts_entry_length == 0) {
                System.err.println("Warning: Empty DParts sub-array (pCOS path \"" + dparts_array_entry + "\")");
            }

            /*
             * Recursively transform all subtrees to XML. Each child starts
             * at the page following the pages consumed by its predecessors.
             */
            for (int j = 0; j < dparts_entry_length; j += 1) {
                String dpart_child_path = dparts_array_entry + "[" + j + "]";
                page_count += dump_dpart_node(p, doc, handler, level + 1,
                    start_page + page_count, node_names, dpart_child_path);
            }
        }

        return page_count;
    }

    /**
     * Write one empty PDFPage element for each page of a leaf DPart node.
     *
     * Here we have only a reference to the first and, if there is more than
     * one page in the document part, to the last page. To avoid a complicated
     * walk through the PDF Page tree, we make use of the fact that the order
     * of the page objects as defined by the page tree is the same order in
     * which Page objects are referenced from leaf node DPart dictionaries in
     * a depth-first traversal. By maintaining a counter for the traversed
     * pages we can use the pCOS pseudo-object pages array.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param start_page
     *            zero-based page number expected for the Start page
     * @param dpart_node_path
     *            pCOS path of the leaf DPart node
     * @return number of pages in the document part (at least 1)
     *
     * @throws Exception
     *             a pCOS error occurred, the XML serialization failed, the
     *             Start or End entry has a wrong type, or the referenced
     *             pages do not match the page order of the document
     */
    private int dump_dpart_pages(IpCOS p, int doc, TransformerHandler handler,
        int start_page, String dpart_node_path) throws Exception {

        String start_path = dpart_node_path + "/Start";
        String end_path = dpart_node_path + "/End";

        String start_objtype = p.pcos_get_string(doc, "type:" + start_path);
        if (!start_objtype.equals("dict")) {
            throw new Exception("Start entry has wrong type \"" + start_objtype
                + "\" (pCOS path \"" + start_path + "\")");
        }
        int start_id = (int) p.pcos_get_number(doc, "pcosid:" + start_path);

        /* A missing End entry means a single-page document part */
        String end_objtype = p.pcos_get_string(doc, "type:" + end_path);
        int end_id;
        if (end_objtype.equals("null")) {
            end_id = start_id;
        }
        else {
            if (!end_objtype.equals("dict")) {
                throw new Exception("End entry has wrong type \"" + end_objtype
                    + "\" (pCOS path \"" + end_path + "\")");
            }
            end_id = (int) p.pcos_get_number(doc, "pcosid:" + end_path);

            if (end_id == start_id) {
                System.err.println("Warning: End entry present but points to the same page as Start entry (pCOS path \""
                    + end_path + "\")");
            }
        }

        /*
         * We cross-check that the traversal does not diverge from the
         * actual page order by comparing the object id of the first page
         * with the page expected at the current position.
         */
        int total_pages = (int) p.pcos_get_number(doc, "length:pages");
        if (start_page >= total_pages
            || start_id != (int) p.pcos_get_number(doc, "pcosid:pages[" + start_page + "]")) {
            throw new Exception("Sequence of pages retrieved by depth-first traversal of document part tree does not map to order of Page objects");
        }

        /*
         * There is at least one page in the range. For each page produce
         * an empty <PDFPage/> element, walking the pages array until the
         * page referenced by End has been reached.
         */
        AttributesImpl attributes = new AttributesImpl();
        handler.startElement("", "", "PDFPage", attributes);
        handler.endElement("", "", "PDFPage");
        int page_count = 1;

        int page_id = start_id;
        while (page_id != end_id) {
            int page_index = start_page + page_count;

            /* Do not run off the end of the pages array on a bad End entry */
            if (page_index >= total_pages) {
                throw new Exception("End page of document part not found after its Start page in the sequence of Page objects (pCOS path \""
                    + end_path + "\")");
            }

            page_id = (int) p.pcos_get_number(doc, "pcosid:pages[" + page_index + "]");
            handler.startElement("", "", "PDFPage", attributes);
            handler.endElement("", "", "PDFPage");
            page_count += 1;
        }

        return page_count;
    }

    /**
     * Write every entry of a document part metadata dictionary as an XML
     * element, according to Annex D of ISO 16612-2:2010.
     *
     * The element name is the dictionary key with each ':' replaced by '_'.
     * The element content is produced by dump_dpm_entry(), which in turn
     * calls this method again for nested dictionaries.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param dpm_path
     *            pCOS path of the DPM dictionary
     * @throws Exception
     *             a pCOS error occurred or an error occurred in the XML
     *             serialization
     */
    private void dump_dpm(IpCOS p, int doc,
        TransformerHandler handler, String dpm_path)
                                    throws  Exception {
        final AttributesImpl no_attributes = new AttributesImpl();
        final int entry_count =
            (int) p.pcos_get_number(doc, "length:" + dpm_path);

        for (int index = 0; index < entry_count; index++) {
            final String entry = dpm_path + "[" + index + "]";

            /* The dictionary key becomes the element name */
            final String element_name =
                p.pcos_get_string(doc, entry + ".key").replace(':', '_');

            handler.startElement("", "", element_name, no_attributes);
            dump_dpm_entry(p, doc, handler, entry + ".val");
            handler.endElement("", "", element_name);
        }
    }

    /**
     * Transform the value of a single entry in the DPM dictionary to XML.
     *
     * Strings and names are written as character data, numbers as plain
     * decimal text, booleans as "true" or "false". Each array element is
     * wrapped in an "Item" element, and dictionaries and streams are
     * expanded through dump_dpm(). Values of any other type produce no
     * output.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param value_path
     *            pCOS path of the value of the entry
     * @throws Exception
     *             a pCOS error occurred or an error occurred in the XML
     *             serialization
     */
    private void dump_dpm_entry(IpCOS p, int doc, TransformerHandler handler,
            String value_path) throws  Exception {

        String objtype = p.pcos_get_string(doc, "type:" + value_path);

        if (objtype.equals("string") || objtype.equals("name")) {
            String value = p.pcos_get_string(doc, value_path);
            handler.characters(value.toCharArray(), 0, value.length());
        }
        else if (objtype.equals("array")) {
            int array_length = (int) p.pcos_get_number(doc, "length:" + value_path);
            AttributesImpl a = new AttributesImpl();
            for (int j = 0; j < array_length; j += 1) {
                handler.startElement("", "", "Item", a);
                dump_dpm_entry(p, doc, handler, value_path + "[" + j + "]");
                handler.endElement("", "", "Item");
            }
        }
        else if (objtype.equals("dict") || objtype.equals("stream") || objtype.equals("fstream")) {
            dump_dpm(p, doc, handler, value_path);
        }
        else if (objtype.equals("number")) {
            /*
             * Locale.US guarantees '.' as decimal separator. The defaults
             * of NumberFormat are meant for display, though: they insert
             * grouping separators ("12,345") and round to three fraction
             * digits, both of which would alter the value in the XML
             * output.
             */
            NumberFormat f = NumberFormat.getInstance(Locale.US);
            f.setGroupingUsed(false);
            f.setMaximumFractionDigits(15);
            String value = f.format(p.pcos_get_number(doc, value_path));
            handler.characters(value.toCharArray(), 0, value.length());
        }
        else if (objtype.equals("boolean")) {
            /* The boolean is fetched as a number; non-zero means true */
            String value =
                (int) p.pcos_get_number(doc, value_path) != 0 ?
                    "true" : "false";
            handler.characters(value.toCharArray(), 0, value.length());
        }
    }

    /**
     * Create the example; all arguments are passed on unchanged to the
     * pcos_cookbook_example superclass constructor.
     *
     * @param argv
     *            command line arguments
     * @param readable_name
     *            human-readable name of the example
     * @param search_path
     *            search path handed to the superclass (main() passes the
     *            directory of the input data files)
     */
    public dpart_to_xml(String[] argv, String readable_name,
        String search_path) {
        super(argv, readable_name, search_path);
    }

    /**
     * Program entry point: set up the example with the command line
     * arguments and the default search path, then run it.
     *
     * @param argv
     *            command line arguments
     */
    public static void main(String argv[]) {
        new dpart_to_xml(argv, "Document Part Hierarchy", SEARCH_PATH)
            .execute();
    }
}
pCOS Cookbook

interchange/dpart_to_xml