PDFlib
PDFlib

identify_ocr

Classify the pages in a document according to the following criteria:

No text or raster images: The page contains neither text nor raster images, i.e. it is empty or contains only vector graphics.

Image only (raw scan): The page contains at least one image, but no text.

Searchable image (also called image+hidden text): The page contains at least one image plus text with textrendering=3 (invisible). No other text is present on the page. 

 Visible text: The page contains text, and all text uses textrendering!=3

Mixed: all other cases, i.e. mixed textrendering modes

Download Java Code     Show Output     Show Input PDF

package com.pdflib.cookbook.tet.special;


import com.pdflib.TET;

import com.pdflib.TETException;


/**

 * Classify the pages in a document according to the following criteria:

 * <p>

 * - No text and raster: The page contains neither text nor raster images, i.e.

 * it is empty or contains only vector graphics<br>

 * - Image only (raw scan): The page contains at least one image, but not any

 * text<br>

 * - Searchable image (also called image+hidden text): The page contains at

 * least one image plus text with textrendering=3 (invisible). No other text is

 * present on the page.<br>

 * - Visible text: The page contains text, and all text uses textrendering!=3<br>

 * - Mixed: all other cases, i.e. mixed textrendering modes<br>

 * <p>

 * Required software: TET 3

 * <p>

 * Required data: PDF document

 *

 * @version $Id: identify_ocr.java,v 1.2 2008/11/21 07:37:42 stm Exp $

 */

class identify_ocr {

    /**

     * Global option list. The program expects the "resource" directory parallel

     * to the "java" directory.

     */

    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "

            + "../resource/glyphlist ../input}";


    /**

     * Document specific option list.

     */

    private static final String DOC_OPTLIST = "";


    /**

     * Page-specific option list.

     */

    private static final String PAGE_OPTLIST = "granularity=glyph";

   

    /**

     * The name of the input file

     */

    private String filename;

   

    /**

     * Invisible text rendering mode.

     */

    private static final int INVISIBLE_TEXT_RENDERING = 3;

   

    /**

     * Process a page in the document, and print out the classification of the

     * page.

     *

     * @param tet

     *            TET object

     * @param doc

     *            TET document handle

     * @param pageno

     *            Page to process

     *

     * @throws TETException

     *             An error occurred in the TET API

     */

    private static void process_page(TET tet, final int doc, int pageno)

            throws TETException {

        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);


        if (page == -1) {

            System.err.println("Error " + tet.get_errnum() + " in "

                    + tet.get_apiname() + "(): " + tet.get_errmsg());

        }

        else {

            // Is invisible text present?

            boolean hasInvisibleText = false;

           

            // Is normal text present?

            boolean hasNormalText = false;

           

            /* Retrieve all text fragments for the page */

            for (String text = tet.get_text(page); text != null; text = tet

                    .get_text(page)) {

                /* loop over all characters */

                while (tet.get_char_info(page) != -1) {

                    if (tet.textrendering == INVISIBLE_TEXT_RENDERING) {

                        hasInvisibleText = true;

                    }

                    else {

                        hasNormalText = true;

                    }

                }

            }

           

            /* Check whether there's at least one raster image on the page */

            boolean hasImage = false;

            if (tet.get_errnum() == 0) {

                hasImage = tet.get_image_info(page) == 1;

            }


            if (tet.get_errnum() == 0) {

                boolean hasText = hasInvisibleText || hasNormalText;

               

                System.out.print("Page " + pageno + ": ");

                if (hasText) {

                    if (hasImage && hasInvisibleText && !hasNormalText) {

                        System.out.print("Searchable image");

                    }

                    else if (hasNormalText && !hasInvisibleText) {

                        System.out.print("Visible text");

                    }

                    else {

                        System.out.print("Mixed");

                    }

                }

                else {

                    // no text at all

                    if (hasImage) {

                        System.out.print("Image only");

                    }

                    else {

                        System.out.print("No text or raster graphics");

                    }               

                }

           

                System.out.println();

            }

            else {

                System.err.println("Error " + tet.get_errnum() + " in "

                        + tet.get_apiname() + "(): " + tet.get_errmsg());

            }


            tet.close_page(page);

        }

    }


    private void execute() {

        TET tet = null;

        int pageno = 0;

   

        try {

            tet = new TET();

            tet.set_option(GLOBAL_OPTLIST);

   

            final int doc = tet.open_document(filename, DOC_OPTLIST);

            if (doc == -1) {

                System.err.println("Error " + tet.get_errnum() + " in "

                        + tet.get_apiname() + "(): " + tet.get_errmsg());

                return;

            }

   

            System.out.println("Page classification for document \""

                    + filename + "\"");   

            /*

             * Loop over pages in the document

             */

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            for (pageno = 1; pageno <= n_pages; ++pageno) {

                process_page(tet, doc, pageno);

            }

   

            tet.close_document(doc);

        }

        catch (TETException e) {

            if (pageno == 0) {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "(): " + e.get_errmsg() + "\n");

            }

            else {

                System.err.println("Error " + e.get_errnum() + " in "

                        + e.get_apiname() + "() on page " + pageno + ": "

                        + e.get_errmsg() + "\n");

            }

        }

        finally {

            tet.delete();

        }

    }

   

    /**

     * @param filename

     *            the name of the file for which the template will be

     *            generated

     */

    private identify_ocr(String filename) {

        this.filename = filename;

    }

   

    public static void main(String[] args) {

        if (args.length != 1) {

            System.out.println("usage: identify_ocr <infilename>");

            return;

        }


        identify_ocr t = new identify_ocr(args[0]);

        t.execute();

    }

}