BASKET
Search
PDFlib

interchange/dpart_to_xml

Transform a PDF/VT DPart tree to XML according to ISO 16612-2:2010

Download Java Code    Show Output     Show Input PDF

package com.pdflib.cookbook.pcos.interchange;


import java.io.BufferedWriter;

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.text.NumberFormat;

import java.util.Locale;


import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.sax.SAXTransformerFactory;

import javax.xml.transform.sax.TransformerHandler;

import javax.xml.transform.stream.StreamResult;


import org.xml.sax.helpers.AttributesImpl;


import com.pdflib.IpCOS;

import com.pdflib.cookbook.pcos.pcos_cookbook_example;


/**

 * pCOS sample application for dumping the "Document Part Hierarchy" of a PDF/VT

 * document as XML as specified in Annex D of ISO 16612-2:2010. If no Document

 * Part Hierarchy is present, an XML file with an empty "<PDFVT>" element is

 * created.

 * <p>

 * In order to generate well-formed XML a Java-specific method for generating

 * the XML output is used. When porting this code to other programming

 * languages, this must be replaced accordingly.

 * <p>

 * The code to dump a DPM dictionary is not protected against incorrect endless

 * recursive dictionaries. A check for this should be implemented in

 * production code.

 * <p>

 * Required software: pCOS interface 3 (pCOS 3.x, PDFlib+PDI/PPS 7.x, TET 2.2,

 * PLOP 3.x) <br>

 * Required data: PDF/VT or PDF 2.0 document with DParts

 * <p>

 * @version $Id: dpart_to_xml.java,v 1.4 2015/11/16 11:53:16 stm Exp $

 */

public class dpart_to_xml extends pcos_cookbook_example {


    /**
     * Max. length of a sub-array in the DParts array of a DPart dictionary.
     * Used only for consistency warnings while walking the DPart tree: every
     * sub-array except the last one is expected to have exactly this many
     * entries, and no sub-array should have more.
     */
    private static final int DPARTS_MAX_LENGTH = 8192;


    /*
     * This is where the data files are. Adjust as necessary. The value is
     * handed to the constructor as the search path by main().
     */
    private final static String SEARCH_PATH = "../input";


    public void example_code(IpCOS p, int doc) throws Exception {


        /* Open the XML output document and set up an XML serialization to it */

        String filename = p.pcos_get_string(doc, "filename");

        File input_file = new File(filename);

        String basename = input_file.getName();

        String xml_name = basename + ".dpart.xml";

        Writer xml = new BufferedWriter(new OutputStreamWriter(

            new FileOutputStream(xml_name), "UTF-8"));

        System.out.println("Writing Document Part Hierarchy to file "

            + xml_name);


        StreamResult xml_result = new StreamResult(xml);

        SAXTransformerFactory transformer_factory =

            (SAXTransformerFactory) SAXTransformerFactory.newInstance();

        transformer_factory.setAttribute("indent-number", new Integer(2));


        TransformerHandler handler =

            transformer_factory.newTransformerHandler();

        Transformer serializer = handler.getTransformer();

        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

        serializer.setOutputProperty(OutputKeys.METHOD, "xml");

        serializer.setOutputProperty(OutputKeys.INDENT, "yes");

        handler.setResult(xml_result);

        handler.startDocument();

        AttributesImpl attributes = new AttributesImpl();


        /*

         * Put some information about the input document into an XML comment

         */

        String comment = "\nInput PDF document: " + p.pcos_get_string(doc, "filename")

                + "\nPDF version: " + p.pcos_get_string(doc, "pdfversionstring")

                + "\nPDF/VT version: " + p.pcos_get_string(doc, "pdfvt")

                + "\n";

        handler.comment(comment.toCharArray(), 0, comment.length());

       

        /*

         * Fixed XML root element of the document.

         */

        handler.startElement("", "", "PDFVT", attributes);

       

        String objtype = p.pcos_get_string(doc, "type:/Root/DPartRoot");

        if (objtype.equals("dict")) {

            /*

             * Retrieve the required list of node names.

             */

            objtype = p.pcos_get_string(doc,

                "type:/Root/DPartRoot/NodeNameList");

            if (!objtype.equals("array")) {

                throw new Exception(

                    "Required entry \"NodeNameList\" is missing in DPartRoot dictionary or has wrong type");

            }


            int num_names = (int) p.pcos_get_number(doc,

                "length:/Root/DPartRoot/NodeNameList");

            String node_names[] = new String[num_names];

            for (int i = 0; i < num_names; i += 1) {

                node_names[i] = p.pcos_get_string(doc,

                    "/Root/DPartRoot/NodeNameList[" + i + "]");

            }


            objtype = p.pcos_get_string(doc,

                "type:/Root/DPartRoot/DPartRootNode");

            if (!objtype.equals("dict")) {

                throw new Exception(

                    "Required entry \"DPartRootNode\" is missing in DPartRoot dictionary or has wrong type");

            }


            /*

             * Invoke the recursive method that dumps the document part

             * hierarchy

             */

            dump_dpart_node(p, doc, handler, 0, 0, node_names,

                "/Root/DPartRoot/DPartRootNode");


        }

        else {

            throw new Exception(

                "DPartRoot dictionary missing from Catalog or has wrong type");

        }

       

        handler.endElement("", "", "PDFVT");


        xml.close();

    }


    /**

     * Recursively transform a DPart node to XML.

     *

     * @param p

     *            IpCOS object

     * @param doc

     *            document handle

     * @param handler

     *            XML serialization handler

     * @param level

     *            level in DPart tree (zero-based)

     * @param start_page

     *            zero-based number of start page of DPart subtree

     * @param node_names

     *            array containing the names for the levels of the DPart tree

     * @param dpart_node_path

     *            pCOS path of the DPart node

     * @return number of pages covered by the DPart subtree

     *

     * @throws Exception

     *             a pCOS error occurred

     * @throws Exception

     *             a non-recoverable error was discovered in the DPart tree

     */

    private int dump_dpart_node(IpCOS p, int doc, TransformerHandler handler,

        int level, int start_page, String node_names[], String dpart_node_path)

        throws  Exception {


        /*

         * Consistency check: The DPart tree may not be deeper than the length

         * of the NodeNameList array, as for every level of the tree the

         * corresponding entry from the NodeNameList array is used to name the

         * XML element

         */

        if (level >= node_names.length) {

            throw new Exception(

                "Document part hierarchy is deeper than allowed by the length of the NodeNameList array in the DPartRoot dictionary");

        }


        /*

         * XML attributes are not used in the DPart-to-XML conversion, but an

         * empty AttributesImpl object must be provided nevertheless.

         */

        AttributesImpl attributes = new AttributesImpl();

        handler.startElement("", "", node_names[level], attributes);


        /*

         * Transform the document part metadata to XML if present.

         */

        String dpm_path = dpart_node_path + "/DPM";

        String objtype = p.pcos_get_string(doc, "type:" + dpm_path);

        if (objtype.equals("dict")) {

            handler.startElement("", "", "DPM", attributes);

            dump_dpm(p, doc, handler, dpm_path);

            handler.endElement("", "", "DPM");

        }

        else if (!objtype.equals("null")) {

            System.err.println("Warning: Unexpected type \"" + objtype

                + "\" for DPM entry, skipping it (pCOS Path \"" + dpart_node_path + "\")");

        }


        /*

         * Retrieve information whether this is an inner node of the tree or

         * a leaf.

         */

        String dparts_array = dpart_node_path + "/DParts";

        String dparts_objtype = p.pcos_get_string(doc, "type:" + dparts_array);

        String start_objtype = p.pcos_get_string(doc, "type:" + dpart_node_path

            + "/Start");

        String end_objtype = p.pcos_get_string(doc, "type:" + dpart_node_path

            + "/End");

       

        if (!dparts_objtype.equals("null") && !start_objtype.equals("null")) {

            throw new Exception(

                "DPart dictionary contains both a DParts and a Start key (pCOS Path \""

                + dpart_node_path + "\")");

        }


        /* page count for subtree */

        int page_count = 0;

       

        if (!dparts_objtype.equals("null")) {

            if (dparts_objtype.equals("array")) {

                /*

                 * This is an array of arrays, where each sub-array has a

                 * maximum of 8192 entries.

                 */

                int dparts_length = (int) p.pcos_get_number(doc, "length:" + dparts_array);

                for (int i = 0; i < dparts_length; i += 1) {

                    String dparts_array_entry = dparts_array + "[" + i + "]";

                    objtype = p.pcos_get_string(doc, "type:" + dparts_array_entry);

                    if (!objtype.equals("array")) {

                        throw new Exception(

                            "DParts array entry has wrong type \"" + objtype

                                + "\" (pCOS path \"" + dparts_array_entry + "\")");

                    }

                   

                    /*

                     * Check that every sub-array but the last one has exactly

                     * 8192 entries.

                     */

                    int dparts_entry_length = (int) p.pcos_get_number(doc,

                        "length:" + dparts_array_entry);

                    if (i < dparts_length - 1 && dparts_entry_length != DPARTS_MAX_LENGTH) {

                        System.err.println("Warning: DParts sub-array has " + dparts_entry_length

                            + " entries, should be 8192 (pCOS path \"" + dparts_array_entry + "\")");

                    }

                    else if (dparts_entry_length > DPARTS_MAX_LENGTH) {

                        System.err.println("Warning: DParts sub-array has more than 8192 entries (pCOS path \"" + dparts_array_entry + "\")");

                    }

                    else if (dparts_entry_length == 0) {

                        System.err.println("Warning: Empty DParts sub-array (pCOS path \"" + dparts_array_entry + "\")");

                    }

                   

                    /*

                     * Recursively transform all subtrees to XML.

                     */

                    for (int j = 0; j < dparts_entry_length; j += 1) {

                        String dpart_child_path = dparts_array_entry + "[" + j + "]";

                        page_count += dump_dpart_node(p, doc, handler, level + 1, start_page + page_count,

                                        node_names, dpart_child_path);

                    }

                }

            }

            else {

                throw new Exception("DParts entry has wrong type \"" + objtype

                    + "\" (pCOS path \"" + dparts_array + "\")");

            }

        }

        else {

            /*

             * Enumerate the pages that belong to the document part. Here we

             * have only a reference to the first and, if there is more than one

             * page in the document part, to the last page. To avoid a

             * complicated walk through the PDF Page tree, we make use of the

             * the fact that the order of the page objects as defined by the

             * page tree is the same order in which Page objects are referenced

             * from leaf node DPart dictionaries in a depth-first traversal. By

             * maintaining a counter for the traversed pages we can use the pCOS

             * pseudo-object pages array.

             */


            if (!start_objtype.equals("dict")) {

                throw new Exception("Start entry has wrong type \"" + start_objtype

                    + "\" (pCOS path \"" + dpart_node_path + "/Start\")");

            }

           

            int start_id = (int) p.pcos_get_number(doc, "pcosid:"

                + dpart_node_path + "/Start");

           

            int end_id;

            if (end_objtype.equals("null")) {

                end_id = start_id;

            }

            else {

                if (!end_objtype.equals("dict")) {

                    throw new Exception("End entry has wrong type \"" + objtype

                        + "\" (pCOS path \"" + dpart_node_path + "/End\")");

                }

                end_id = (int) p.pcos_get_number(doc, "pcosid:"

                    + dpart_node_path + "/End");

               

                if (end_id == start_id) {

                    System.err.println("Warning: End entry present but points to the same page as Start entry (pCOS path \""

                        + dpart_node_path + "/End\")");

                }

            }

           

            /*

             * We cross-check that the traversal does not diverge from the

             * actual page order by comparing the object ids of the first and

             * the last page.

             */

            if (start_id != (int) p.pcos_get_number(doc, "pcosid:pages[" + start_page + "]")) {

                throw new Exception("Sequence of pages retrieved by depth-first traversal of document part tree does not map to order of Page objects");

            }

           

            /*

             * There is at least one page in the range. For each page produce

             * an empty <PDFPage/> element.

             */

            while (start_id != end_id) {

                page_count += 1;

                start_id = (int) p.pcos_get_number(doc, "pcosid:pages[" + (start_page + page_count) + "]");

                handler.startElement("", "", "PDFPage", attributes);

                handler.endElement("", "", "PDFPage");

            }

            page_count += 1;

            handler.startElement("", "", "PDFPage", attributes);

            handler.endElement("", "", "PDFPage");

           

            if (end_id != (int) p.pcos_get_number(doc, "pcosid:pages[" + (start_page + page_count - 1) + "]")) {

                throw new Exception("Sequence of pages retrieved by depth-first traversal of document part tree does not map to sequence of Page objects");

            }

        }


        handler.endElement("", "", node_names[level]);

       

        return page_count;

    }


    /**

     * Recursively transform an entry in the document part metadata to XML

     * according to Annex D of ISO 16612-2:2010.

     *

     * @param p

     *            IpCOS object

     * @param doc

     *            document handle

     * @param handler

     *            XML serialization handler

     * @param dpm_path

     *            pCOS path of the DPM entry

     * @throws Exception

     *             a pCOS error occured

     * @throws Exception

     *             an error occurred in the XML serialization

     */

    private void dump_dpm(IpCOS p, int doc,

        TransformerHandler handler, String dpm_path)

                                    throws  Exception {

        AttributesImpl attributes = new AttributesImpl();

       

        int dict_length = (int) p.pcos_get_number(doc, "length:" + dpm_path);

        for (int i = 0; i < dict_length; i += 1) {

            String entry_path = dpm_path + "[" + i + "]";

           

            String key = p.pcos_get_string(doc, entry_path + ".key");

            String xml_key = key.replace(':', '_');

           

            handler.startElement("", "", xml_key, attributes);

            dump_dpm_entry(p, doc, handler, entry_path + ".val");

            handler.endElement("", "", xml_key);

        }

    }


    /**

     * Transform a single entry in the DPM dictionary to XML.

     *

     * @param p

     *            IpCOS object

     * @param doc

     *            document handle

     * @param handler

     *            XML serialization handler

     * @param entry_path

     *            pCOS path of the entry

     * @throws Exception

     *             a pCOS error occured

     * @throws Exception

     *             an error occurred in the XML serialization

     */

    private void dump_dpm_entry(IpCOS p, int doc, TransformerHandler handler,

            String value_path) throws  Exception {

       

        String objtype = p.pcos_get_string(doc, "type:" + value_path);

       

        if (objtype.equals("string") || objtype.equals("name")) {

            String value = p.pcos_get_string(doc, value_path);

            handler.characters(value.toCharArray(), 0, value.length());

        }

        else if (objtype.equals("array")) {

            int array_length = (int) p.pcos_get_number(doc, "length:" + value_path);

            AttributesImpl a = new AttributesImpl();

            for (int j = 0; j < array_length; j += 1) {

                handler.startElement("", "", "Item", a);

                dump_dpm_entry(p, doc, handler, value_path + "[" + j + "]");

                handler.endElement("", "", "Item");

            }

        }

        else if (objtype.equals("dict") || objtype.equals("stream") || objtype.equals("fstream")) {

            dump_dpm(p, doc, handler, value_path);

        }

        else if (objtype.equals("number")) {

            NumberFormat f = NumberFormat.getInstance(Locale.US);

            String value = f.format(p.pcos_get_number(doc, value_path));

            handler.characters(value.toCharArray(), 0, value.length());

        }

        else if (objtype.equals("boolean")) {

            String value =

                (int) p.pcos_get_number(doc, value_path) != 0 ?

                    "true" : "false";

            handler.characters(value.toCharArray(), 0, value.length());

        }

    }


    /**
     * Create the example object. All arguments are passed unchanged to the
     * constructor of the superclass pcos_cookbook_example; how they are used
     * there is not visible in this file.
     *
     * @param argv
     *            command line arguments as received by main()
     * @param readable_name
     *            descriptive name of the example; main() passes
     *            "Document Part Hierarchy"
     * @param search_path
     *            location of the input data files; main() passes SEARCH_PATH
     * @param full_rcs_file_name
     *            RCS "$RCSfile$" keyword string of this source file
     * @param revision
     *            RCS "$Revision$" keyword string of this source file
     */
    public dpart_to_xml(String[] argv, String readable_name,

        String search_path, String full_rcs_file_name, String revision) {

        super(argv, readable_name, search_path, full_rcs_file_name, revision);

    }


    public static void main(String argv[]) {

        dpart_to_xml example = new dpart_to_xml(argv,

            "Document Part Hierarchy", SEARCH_PATH,

            "$RCSfile: dpart_to_xml.java,v $", "$Revision: 1.4 $");

        example.execute();

    }

}