/*
 * PDF text extractor based on PDFlib TET
 *
 * The text is written as UTF-8. The program uses the std::u8string
 * type and UTF-8 string literals that are available in C++20 and later.
 *
 * The program assumes that the program arguments are encoded as UTF-8.
 */

#include <iostream>
#include <iomanip>
#include <fstream>
#include <codecvt>

#define TET_CPP_U8STRING
#include "tet.hpp"

/* Figure out whether we're running on an EBCDIC-based machine. */
#if 'A' == 0xC1
#define NATIVE_ENCODING u8"ebcdicutf8"
#else
#define NATIVE_ENCODING u8"utf8"
#endif

using namespace pdflib;
using namespace std;

namespace
{

/* Option list applied to the TET object as a whole (passed to set_option()). */
const u8string globaloptlist{
            u8"searchpath={{../data} {../../../resource/cmap}}"};

/* Option list passed to open_document(); empty here. */
const u8string docoptlist{u8""};

/* Option list passed to open_page() for every page. */
const u8string pageoptlist{u8"granularity=page"};

/* Separator written after each chunk of text. The best choice depends on
 * the application's needs; for granularity=word a space character may be
 * useful.
 */
const u8string separator{u8"\n"};

/* Conversion helpers between u8string and the native encoding; they are
 * defined after main().
 */
string get_native_string(const u8string& utf8_string);
u8string get_u8string(const string& native_string);

} // end anonymous namespace

/*
 * Extract the text from all pages of a PDF document and write it to an
 * output file as UTF-8 with a leading BOM.
 *
 * argv[1] is the input PDF file name, argv[2] the output file name.
 *
 * Pages that cannot be opened are reported on stderr and skipped; an error
 * reported after text retrieval on a page is printed as well, and processing
 * continues with the next page.
 *
 * Exit codes: 0 on success; 2 for a usage error, an output file that cannot
 * be opened or written, a document that cannot be opened, or a caught
 * TET::Exception; 99 for any other caught C++ exception.
 */
int main(int argc, char **argv)
{
    /* The usage check does not need a TET object, so perform it first. */
    if (argc != 3)
    {
        cerr << "usage: extractor_u8string <infilename> <outfilename>" << endl;
        return 2;
    }

    ofstream out;

    /* Stays 0 until the page loop starts; the TET exception handler uses it
     * to decide whether a page number can be reported.
     */
    int pageno = 0;

    try
    {
        TET tet;

        out.open(argv[2], ios::binary);
        if (!out.is_open())
        {
            cerr << "Couldn't open output file " << argv[2] << endl;
            return 2;
        }

        // Write UTF-8 BOM
        out << "\xEF\xBB\xBF";

        tet.set_option(globaloptlist);

        /*
         * Caution: For simplicity we assume that the program arguments are
         * encoded as UTF-8, which might not be true in all cases!
         */
        const int doc = tet.open_document(get_u8string(argv[1]), docoptlist);

        if (doc == -1)
        {
            cerr << "Error " << tet.get_errnum()
                << " in " << get_native_string(tet.get_apiname()) << "(): "
                << get_native_string(tet.get_errmsg()) << endl;
            return 2;
        }

        /* get number of pages in the document */
        const int n_pages =
            static_cast<int>(tet.pcos_get_number(doc, u8"length:pages"));

        /* loop over pages in the document */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            const int page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                cerr << "Error " << tet.get_errnum()
                    << " in " << get_native_string(tet.get_apiname()) << "(): "
                    << get_native_string(tet.get_errmsg()) << endl;
                continue;                        /* try next page */
            }

            /* Retrieve all text fragments; This is actually not required
             * for granularity=page, but must be used for other granularities.
             *
             * The chunks are written with their explicit length so that the
             * output is not cut short at an embedded NUL character.
             */
            u8string text;
            while (!(text = tet.get_text(page)).empty())
            {
                out.write(reinterpret_cast<char const *>(text.data()),
                        static_cast<std::streamsize>(text.size()));
                out.write(reinterpret_cast<char const *>(separator.data()),
                        static_cast<std::streamsize>(separator.size()));
            }

            if (tet.get_errnum() != 0)
            {
                /* Same message layout as the exception handler below. */
                cerr << "Error " << tet.get_errnum()
                    << " in " << get_native_string(tet.get_apiname())
                    << "() on page " << pageno
                    << ": " << get_native_string(tet.get_errmsg()) << endl;
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TET::Exception &ex) {
        if (pageno == 0)
        {
            /* The failure happened before the page loop was entered. */
            cerr << "Error " << ex.get_errnum()
                << " in " << get_native_string(ex.get_apiname())
                << "(): " << get_native_string(ex.get_errmsg()) << endl;
        }
        else
        {
            cerr << "Error " << ex.get_errnum()
                << " in " << get_native_string(ex.get_apiname())
                << "() on page " << pageno
                << ": " << get_native_string(ex.get_errmsg()) << endl;
        }
        return 2;
    }
    catch (const exception &e) {
        cerr << "C++ exception occurred: " << e.what() << endl;
        return 99;
    }
    catch (...) {
        cerr << "Generic C++ exception occurred!" << endl;
        return 99;
    }

    /* Closing flushes the remaining data; report write failures instead of
     * silently producing a truncated file.
     */
    out.close();
    if (!out)
    {
        cerr << "Error writing output file " << argv[2] << endl;
        return 2;
    }

    return 0;
}

namespace
{

// This TET object is used solely for conversion purposes between the encoding
// of the TET API parameters and the native encoding.
// NOTE(review): as a namespace-scope object it is constructed outside of
// main()'s try block, so an exception thrown by its constructor (if it can
// throw) would not be caught there -- confirm against the TET binding.
TET conversion_helper;

// Convert a string in the native encoding (NATIVE_ENCODING) to a UTF-8
// encoded u8string.
u8string get_u8string(const string& native_string)
{
    const string u8_bytes =
        conversion_helper.convert_to_unicode(NATIVE_ENCODING, native_string,
                                                    u8"outputformat=utf8");

    // Copy element by element instead of reinterpreting the char buffer as
    // char8_t: char8_t is not one of the types that may alias other objects,
    // so reading char storage through a char8_t pointer is formally
    // undefined behavior.
    return u8string(u8_bytes.begin(), u8_bytes.end());
}

// Convert a UTF-8 encoded u8string to a string in the native encoding
// (NATIVE_ENCODING), e.g. for printing it on cerr.
string get_native_string(const u8string& utf8_string)
{
    // convert_to_unicode() takes its input as a plain byte string, so the
    // UTF-8 code units are copied into a std::string first.
    const string u8_bytes(utf8_string.begin(), utf8_string.end());

    return conversion_helper.convert_to_unicode(u8"utf8", u8_bytes,
                                            u8"outputformat=" NATIVE_ENCODING);
}

} // end anonymous namespace
