<?php
/*
 * (C) PDFlib GmbH 2015-2021 www.pdflib.com
 *
 * This module requires PDFlib TET (Text and Image Extraction Toolkit) installed
 * in PHP. It is available at https://www.pdflib.com/download/tet/
 *
 * Add the "pdf" extension to the list of file extensions permitted for
 * upload in LocalSettings.php:
 *   $wgFileExtensions[] = 'pdf';
 */

/**
 * MediaWiki hook handlers that extract the text of an uploaded PDF with
 * PDFlib TET and store it, wrapped in an HTML comment, as the content of the
 * file's description page.
 *
 * The handlers communicate through the global $tetNewFileIndex, which holds
 * either the wrapped text of the upload being processed or false.
 */
class PDFIndexer {
    /**
     * Hook handler: index a freshly uploaded PDF.
     *
     * Uploads whose file name does not end in ".pdf" (case-insensitive) are
     * ignored. For PDFs the document info entries and the text of all pages
     * are extracted, reduced to fit below $wgMaxArticleSize, wrapped in an
     * HTML comment and written as new page content for the file's title.
     *
     * Extraction failures are logged to the "PDFIndexer" debug log group and
     * leave $tetNewFileIndex untouched, so no partial text leaks into a
     * later page save.
     *
     * @param object $image upload object; must provide getLocalFile()
     * @return bool always true
     */
    public static function onUploadComplete(&$image){
        global $tetNewFileIndex, $wgScriptPath, $wgMaxArticleSize;

        /* maxsize without the comment wrapper added below */
        $tooLarge = ($wgMaxArticleSize*1024) - 20;

        /* NOTE(review): the local path is derived from the file URL; this
         * presumably breaks for URL-encoded file names or storage outside
         * the script directory -- confirm against the deployment.
         */
        $file = dirname($_SERVER["SCRIPT_FILENAME"]) .
            str_replace($wgScriptPath, "", $image->getLocalFile()->url);

        /* Only PDF uploads are indexed; everything else is left alone. */
        if (strtolower(pathinfo($file, PATHINFO_EXTENSION)) !== "pdf") {
            return true;
        }

        try {
            $text = self::extractText($file);
        }
        catch (TETException $e) {
            /* Use the exception itself: the TET object may not exist if its
             * construction was what failed.
             */
            wfDebugLog("PDFIndexer", "PDFIndexer: TET exception occurred:\n". $e->get_errnum() . " in " . $e->get_apiname() . "(): " . $e->get_errmsg() . "\n");
            return true;
        }
        catch (Exception $e) {
            wfDebugLog("PDFIndexer","PDFIndexer: " . $e . "\n");
            return true;
        }

        $text = self::fitToLimit($text, $tooLarge);

        /* Strip comment terminators last: shrinking the text can join
         * fragments into a new "-->". Then add comments around the text.
         */
        $tetNewFileIndex = "<!-- \r\n" . self::stripCommentTerminators($text) . "\r\n//-->";

        $title = $image->getLocalFile()->getTitle();
        $wikipage = new WikiPage($title);
        $content = new WikitextContent( $tetNewFileIndex );
        $wikipage->doEditContent($content, "PDFIndexer: PDF uploaded.\n", EDIT_NEW);

        return true;
    }


    /**
     * Hook handler: hand the pending extracted text to the page save.
     *
     * If $tetNewFileIndex holds text, it is assigned to $summary. The global
     * is reset to false in every case. An uninitialised (null) global is
     * treated like false so that $summary is never overwritten with null.
     *
     * @return bool always true
     */
    public static function onPageContentSave( &$renderedRevision, &$user, &$summary, &$flags, &$hookStatus ) {
        global $tetNewFileIndex;

        if ($tetNewFileIndex !== false && $tetNewFileIndex !== null){
            $summary = $tetNewFileIndex;
        }
        $tetNewFileIndex = false;
        return true;
    }


    /**
     * Extract the document info entries and the text of all pages of a PDF.
     *
     * @param string $file path of the PDF file
     * @return string info entries (one "key: value" line each) followed by
     *                the concatenated page texts
     * @throws Exception if the document cannot be opened
     * @throws TETException presumably raised by the TET binding on API
     *                      errors; the caller catches it
     */
    private static function extractText($file) {
        $globaloptlist = "searchpath={" . dirname( __FILE__ ) . "/resource/cmap}";

        /* document-specific option list
         *
         * Indexing of password-protected documents that disallow text
         * extraction is possible by using the "shrug" option of
         * TET_open_document(). Please read the relevant section in the
         * PDFlib Terms and Conditions and the TET Manual about the
         * "shrug" option to understand the implications of using this
         * feature.
         *
         * $docoptlist = "shrug";
         */
        $docoptlist = "";

        /* page-specific option list */
        $pageoptlist = "granularity=page";

        $tet = new TET();
        $tet->set_option($globaloptlist);

        $doc = $tet->open_document($file, $docoptlist);
        if ($doc == -1) {
            throw new Exception("[". $tet->get_errnum() . "] in " . $tet->get_apiname() . "(): " . $tet->get_errmsg());
        }

        $text = "";

        /* Add document info entries first */
        $count = $tet->pcos_get_number($doc, "length:/Info");
        for ($i = 0; $i < $count; $i++) {
            $objtype = $tet->pcos_get_string($doc, "type:/Info[$i]");

            /* Info entries can be stored as string or name objects */
            if ($objtype == "string" || $objtype == "name") {
                $text .= sprintf("%12s: %10s\n",
                    $tet->pcos_get_string($doc, "/Info[$i].key"),
                    $tet->pcos_get_string($doc, "/Info[$i]"));
            }
        }

        /* Loop over all pages */
        $n_pages = $tet->pcos_get_number($doc, "length:pages");
        for ($pageno = 1; $pageno <= $n_pages; ++$pageno) {
            /* reset time limit so large documents are not cut off */
            set_time_limit(60);

            $page = $tet->open_page($doc, $pageno, $pageoptlist);
            if ($page == -1) {
                continue;                        /* try next page */
            }

            $text .= $tet->get_text($page);
            $tet->close_page($page);
        }

        $tet->close_document($doc);

        return $text;
    }


    /**
     * Reduce $text to at most $maxBytes bytes.
     *
     * Text that is too long is first collapsed to its unique
     * space-separated words; if that is still too long it is cut off.
     *
     * @param string $text
     * @param int $maxBytes
     * @return string
     */
    private static function fitToLimit($text, $maxBytes) {
        if (strlen($text) <= $maxBytes) {
            return $text;
        }

        $text = implode(" ", array_unique(explode(" ", $text)));
        wfDebugLog("PDFIndexer","PDFIndexer: text length reached limit => build word list\n");

        if (strlen($text) > $maxBytes) {
            /* mb_strcut() cuts by bytes but never inside a multibyte
             * character (assumes the extracted text is UTF-8 -- TODO confirm).
             */
            $text = mb_strcut($text, 0, $maxBytes, 'UTF-8');
            wfDebugLog("PDFIndexer","PDFIndexer: word list reached limit => truncate it\n");
        }

        return $text;
    }


    /**
     * Remove every "-->" from $text so it cannot end the surrounding HTML
     * comment early. Removal is repeated because deleting one occurrence
     * can form a new one (e.g. "---->>").
     *
     * @param string $text
     * @return string
     */
    private static function stripCommentTerminators($text) {
        do {
            $text = str_replace("-->", "", $text, $removed);
        } while ($removed > 0);

        return $text;
    }
}
?>
