/*
 * PDF text extractor based on PDFlib TET
 *
 * The text is written as UTF-16 in native byte order. The program uses the
 * std::u16string type and UTF-16 string literals that are available in
 * C++11 and later.
 *
 * The program assumes that the program arguments are encoded as UTF-8.
 */

#include <iostream>
#include <iomanip>
#include <fstream>
#include <codecvt>

#define TET_CPP_U16STRING
#include "tet.hpp"

/* Figure out whether we're running on an EBCDIC-based machine. */
#if 'A' == 0xC1
#define NATIVE_ENCODING u"ebcdicutf8"
#else
#define NATIVE_ENCODING u"utf8"
#endif

using namespace pdflib;
using namespace std;

namespace
{

/* Option list applied to the TET object as a whole (passed to set_option()) */
const u16string globaloptlist{
            u"searchpath={{../data} {../../../resource/cmap}}"};

/* Option list used when opening the document; empty, i.e. no extra options */
const u16string docoptlist{};

/* Option list used when opening each page */
const u16string pageoptlist{u"granularity=page"};

/* Separator written after each chunk of text. The right choice depends on
 * the application's needs; for granularity=word a space character may be
 * useful.
 */
const u16string separator{u"\n"};

/* Conversion helpers between the native encoding and UTF-16; they are
 * defined after main().
 */
string get_native_string(const u16string& utf16_string);
u16string get_u16string(const string& native_string);

} // end anonymous namespace

namespace
{

/*
 * Write the code units of a UTF-16 string to a binary stream unchanged,
 * i.e. in the byte order of the machine the program runs on.
 */
void emit_utf16(ostream& os, const u16string& chunk)
{
    os.write(reinterpret_cast<const char *>(chunk.data()),
            static_cast<streamsize>(
                chunk.size() * sizeof(u16string::value_type)));
}

/*
 * Print an error report to stderr in the form
 *     Error <errnum> in <apiname>()[ on page <pageno>]: <errmsg>
 * The " on page <pageno>" part is omitted when pageno is 0.
 */
void print_tet_error(int errnum, const u16string& apiname,
                     const u16string& errmsg, int pageno = 0)
{
    cerr << "Error " << errnum
        << " in " << get_native_string(apiname) << "()";
    if (pageno != 0)
    {
        cerr << " on page " << pageno;
    }
    cerr << ": " << get_native_string(errmsg) << endl;
}

} // end anonymous namespace

/*
 * Extract the text of all pages of <infilename> and write it to
 * <outfilename> as UTF-16 in native byte order, preceded by a BOM.
 *
 * Exit status: 0 on success; 2 on usage errors, if the output file cannot
 * be opened or written, if the document cannot be opened, or if a TET
 * exception is thrown; 99 on any other C++ exception.
 */
int main(int argc, char **argv)
{
    ofstream out;

    /* Stays 0 until the page loop starts, so the TET exception handler can
     * tell whether a page number is meaningful for the error report.
     */
    int pageno = 0;

    try
    {
        TET tet;

        if (argc != 3)
        {
            cerr << "usage: extractor_u16string <infilename> <outfilename>" << endl;
            return 2;
        }

        out.open(argv[2], ios::binary);
        if (!out.is_open())
        {
            cerr << "Couldn't open output file " << argv[2] << endl;
            return 2;
        }

        /* Start the file with a byte order mark in native byte order */
        u16string::value_type const bom = 0xfeff;
        out.write(reinterpret_cast<char const *>(&bom), sizeof(bom));

        tet.set_option(globaloptlist);

        /*
         * Caution: For simplicity we assume that the program arguments are
         * encoded as UTF-8, which might not be true in all cases!
         */
        u16string const doc_name(get_u16string(argv[1]));
        const int doc = tet.open_document(doc_name, docoptlist);

        if (doc == -1)
        {
            print_tet_error(tet.get_errnum(), tet.get_apiname(),
                            tet.get_errmsg());
            return 2;
        }

        /* get number of pages in the document */
        const int n_pages =
            static_cast<int>(tet.pcos_get_number(doc, u"length:pages"));

        /* loop over pages in the document */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            const int page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                print_tet_error(tet.get_errnum(), tet.get_apiname(),
                                tet.get_errmsg());
                continue;                        /* try next page */
            }

            /* Retrieve all text fragments; This is actually not required
             * for granularity=page, but must be used for other granularities.
             * An empty result ends the loop.
             */
            u16string text;
            while (!(text = tet.get_text(page)).empty())
            {
                /* print the retrieved text as UTF-16-encoded in the native
                 * byte order, followed by a separator between chunks.
                 */
                emit_utf16(out, text);
                emit_utf16(out, separator);
            }

            /* The loop above ends on an empty result; a non-zero error
             * number is reported as an error for this page.
             */
            if (tet.get_errnum() != 0)
            {
                print_tet_error(tet.get_errnum(), tet.get_apiname(),
                                tet.get_errmsg(), pageno);
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TET::Exception &ex) {
        /* pageno is 0 if the exception was thrown before the page loop */
        print_tet_error(ex.get_errnum(), ex.get_apiname(),
                        ex.get_errmsg(), pageno);
        return 2;
    }
    catch (const exception &e) {
        cerr << "C++ exception occurred: " << e.what() << endl;
        return 99;
    }
    catch (...) {
        cerr << "Generic C++ exception occurred!" << endl;
        return 99;
    }

    /* A failed write or a failed flush on close leaves the stream in a
     * failed state; do not report success for an incomplete output file.
     * argv[2] is valid here because every path with argc != 3 has already
     * returned.
     */
    out.close();
    if (!out)
    {
        cerr << "Error writing output file " << argv[2] << endl;
        return 2;
    }
    return 0;
}

namespace
{

// This TET object is used solely for conversion purposes between the encoding
// of the TET API parameters and the native encoding.
TET conversion_helper;

/*
 * Convert a string in the native encoding (NATIVE_ENCODING, i.e. UTF-8 or
 * EBCDIC-UTF-8) to a UTF-16 string.
 *
 * convert_to_unicode() hands back the UTF-16 result packed into a byte
 * string. The bytes are copied into a u16string instead of reading the
 * byte buffer through a char16_t pointer: the buffer of a std::string is
 * not guaranteed to be suitably aligned for char16_t, and such an access
 * would also violate the aliasing rules. A trailing odd byte, if any, is
 * ignored.
 */
u16string get_u16string(const string& native_string)
{
    string const u16_bytes =
        conversion_helper.convert_to_unicode(NATIVE_ENCODING, native_string,
                                                    u"outputformat=utf16");

    u16string result(u16_bytes.length() / sizeof(u16string::value_type),
                     u'\0');

    // Writing through a char pointer is always permitted; &result[0] is
    // valid even for an empty string, in which case nothing is copied.
    u16_bytes.copy(reinterpret_cast<string::value_type *>(&result[0]),
                   result.size() * sizeof(u16string::value_type));

    return result;
}

/*
 * Convert a UTF-16 string to the native encoding (NATIVE_ENCODING).
 *
 * The code units are first packed into a byte string, because that is the
 * form in which convert_to_unicode() takes its input here. Reading the
 * UTF-16 data through a char pointer is well-defined.
 */
string get_native_string(const u16string& utf16_string)
{
    string const u16_bytes(
            reinterpret_cast<string::value_type const *>(utf16_string.c_str()),
            utf16_string.length() * sizeof(u16string::value_type));

    return conversion_helper.convert_to_unicode(u"utf16", u16_bytes,
                                            u"outputformat=" NATIVE_ENCODING);
}

} // end anonymous namespace
