/*
 * PDF text extractor based on PDFlib TET
 *
 * The text is written as UTF-8. The program uses the std::string
 * type and normal string literals, which by convention are interpreted by
 * TET as being UTF-8-encoded.
 *
 * The program assumes that the program arguments are encoded as UTF-8.
 */

#include <iostream>
#include <iomanip>
#include <fstream>
#include <codecvt>

#define TET_CPP_STRING
#include "tet.hpp"

using namespace pdflib;
using namespace std;

namespace
{

/* Option list applied to the TET object as a whole via set_option(). */
const std::string globaloptlist{
    "searchpath={{../data} {../../../resource/cmap}}"};

/* Option list passed to open_document(); empty means no extra options. */
const std::string docoptlist{""};

/* Option list passed to open_page() for every page. */
const std::string pageoptlist{"granularity=page"};

} // end anonymous namespace

int main(int argc, char **argv)
{
    ofstream out;
    int pageno = 0;
    
    try
    {
        TET tet;

        if (argc != 3)
        {
            cerr << "usage: extractor_string <infilename> <outfilename>" << endl;
            return 2;
        }

        out.open(argv[2], ios::binary);
        if (!out.is_open())
        {
            cerr << "Couldn't open output file " << argv[2] << endl;
            return 2;
        }

        // Write UTF-8 BOM
        out << "\xEF\xBB\xBF";

        tet.set_option(globaloptlist);

        /*
         * Caution: For simplicity we assume that the program arguments are
         * encoded as UTF-8, which might not be true in all cases!
         */
        const int doc = tet.open_document(argv[1], docoptlist);

        if (doc == -1)
        {
            cerr << "Error " << tet.get_errnum()
                << " in " << tet.get_apiname().c_str() << "(): "
                << tet.get_errmsg().c_str() << endl;
            return 2;
        }

        /* get number of pages in the document */
        const int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

        /* loop over pages in the document */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            const int page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                cerr << "Error " << tet.get_errnum()
                    << " in " << tet.get_apiname().c_str() << "(): "
                    << tet.get_errmsg().c_str() << endl;
                continue;                        /* try next page */
            }

            /* Retrieve all text fragments; This is actually not required
             * for granularity=page, but must be used for other granularities.
             */
            string text;
            while ((text = tet.get_text(page)) != "")
            {
                out << text.c_str() << endl;
            }

            if (tet.get_errnum() != 0)
            {
                cerr << "Error " << tet.get_errnum()
                    << " in " << tet.get_apiname().c_str()
                    << "() on page " << pageno
                    << tet.get_errmsg().c_str() << endl;
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TET::Exception &ex) {
        if (pageno == 0)
        {
            cerr << "Error " << ex.get_errnum()
                << " in " << ex.get_apiname().c_str()
                << "(): " << ex.get_errmsg().c_str() << endl;
        }
        else
        {
            cerr << "Error " << ex.get_errnum()
                << " in " << ex.get_apiname().c_str()
                << "() on page " << pageno
                << ": " << ex.get_errmsg().c_str() << endl;
        }
        return 2;
    }
    catch (exception &e) {
        cerr << "C++ exception occurred: " << e.what() << endl;
        return 99;
    }
    catch (...) {
        cerr << "Generic C++ exception occurred!" << endl;
        return 99;
    }

    out.close();
    return 0;
}
