<?php
/* 
 * PDF text extractor which also searches PDF file attachments.
 */

  // Input PDF to scan and the text file that receives everything extracted.
  $infilename = 'Portfolio_sample.pdf';
  $outfilename = 'attachments.txt';

  /*
   * Option list handed to set_option() on every TET object created by
   * process_document(); it lists two search path directories.
   */
  $globaloptlist = 'searchpath={{../data} {../../../resource/cmap}}';

  /*
   * Option list passed to open_document() for each document (none needed).
   */
  $docoptlist = '';

  /*
   * Option list passed to open_page() for each page.
   */
  $pageoptlist = 'granularity=page';

  /*
   * Text written after each retrieved chunk. What is appropriate depends on
   * the application; with granularity=word a single space may suit better.
   */
  $separator = "\n";

  /**
   * Extract the text of every page of an already opened document and write
   * it to an output file handle.
   *
   * Each retrieved chunk of text is followed by the global $separator. Pages
   * are opened with the global $pageoptlist. A page that cannot be opened is
   * reported with printf() and skipped; an error pending after text
   * retrieval is reported as well, and processing continues with the next
   * page.
   *
   * @param tet
   *            The TET object
   * @param doc
   *            A valid TET document handle
   * @param outfp
   *            Output file handle
   *
   */
  function extract_text($tet, $doc, $outfp){
      global $pageoptlist, $separator;

      /* Get number of pages in the document. */
      $n_pages = $tet->pcos_get_number($doc, "length:pages");

      /* loop over pages; page numbers passed to open_page() start at 1 */
      for ($pageno = 1; $pageno <= $n_pages; ++$pageno)
      {
          $page = $tet->open_page($doc, $pageno, $pageoptlist);

          /* -1 signals that the page could not be opened */
          if ($page == -1)
          {
              printf("Error [%d] in %s() on page %d: %s\n",
                  $tet->get_errnum(), $tet->get_apiname(), $pageno, $tet->get_errmsg());
              continue; /* try next page */
          }

          /*
           * Retrieve all text fragments; this loop is actually not required
           * for granularity=page, but must be used for other granularities.
           */
          while (($text = $tet->get_text($page)) != "")
          {
              fwrite($outfp, $text); // emit the retrieved text

              /* emit a separator between chunks of text */
              fwrite($outfp, $separator);
          }

          /*
           * The loop also ends when retrieval fails, so check for a pending
           * error before moving on.
           */
          if ($tet->get_errnum() != 0)
          {
              printf("Error [%d] in %s() on page %d: %s\n",
                  $tet->get_errnum(), $tet->get_apiname(), $pageno, $tet->get_errmsg());
          }

          $tet->close_page($page);
      }
  }

  /**
   * Open a named physical or virtual file, extract the text from it, search
   * for document or page attachments, and process these recursively. Either
   * a filename must be supplied for physical files, or data from which a
   * virtual (PVF) file will be created. The caller cannot create the PVF
   * file itself because a new TET object is created here: if an exception
   * happens with the embedded document, the caller can continue with its own
   * TET object.
   *
   * A TETException is reported with printf() and turned into a return code;
   * any other Throwable terminates the script via die().
   *
   * @param outfp
   *            Output file handle receiving the extracted text
   * @param filename
   *            Name of the file to open; pass null or "" to use data instead
   * @param realname
   *            Name used for the document in error messages
   * @param data
   *            Document contents, used only when no filename is given
   *
   * @return 0 if successful, 5 if the document could not be opened, 1 if a
   *         TETException was caught; intended to be used as exit status
   */
  function process_document($outfp, $filename, $realname, $data)
  {
      global $docoptlist, $globaloptlist;
      $retval = 0;
      try
      {
          $pvfname = "/pvf/attachment";

          $tet = new TET();

          /*
           * Construct a PVF file if data instead of a filename was provided.
           * Only a non-empty string counts as a filename; unlike empty(),
           * this keeps a file that is literally named "0" usable.
           */
          if (!is_string($filename) || $filename === "")
          {
              $tet->create_pvf($pvfname, $data, "");
              $filename = $pvfname;
          }

          $tet->set_option($globaloptlist);

          $doc = $tet->open_document($filename, $docoptlist);

          /* -1 signals that the document could not be opened */
          if ($doc == -1)
          {
              printf("Error [%d] in %s (source: attachment '%s'): %s\n",
                  $tet->get_errnum(), $tet->get_apiname(), $realname, $tet->get_errmsg());

              $retval = 5;
          }
          else
          {
              /* extracts the text, recurses into attachments, closes $doc */
              process_document_single($outfp, $tet, $doc);
          }

          /*
           * If there was no PVF file deleting it won't do any harm
           */
          $tet->delete_pvf($pvfname);
      }
      catch (TETException $e)
      {
          printf("Error %d in %s (source: attachment '%s'): %s\n" , $e->get_errnum(),
                $e->get_apiname(), $realname, $e->get_errmsg());
          $retval = 1;
      }
      catch (Throwable $e) {
          die(get_class($e) . " occurred in get_attachments sample:\n" .
                $e->getMessage() . "\n");
      }

      /* drop the local reference to the TET object */
      $tet = null;

      return $retval;
  }

  /**
   * Handle one opened document: write its page text, then hand every
   * embedded file found at document level and in FileAttachment annotations
   * to process_document(), and close the document at the end.
   *
   * @param outfp Output file handle receiving text and attachment markers
   * @param tet The TET object the document was opened with
   * @param doc The TET document handle
   */
  function process_document_single($outfp, $tet, $doc){

      // ---------- Step 1: the document's own page contents
      extract_text($tet, $doc, $outfp);

      // ---------- Step 2: document-level file attachments
      $embeddedTotal = $tet->pcos_get_number($doc, "length:names/EmbeddedFiles");

      for ($idx = 0; $idx < $embeddedTotal; $idx++)
      {
          $entry = "names/EmbeddedFiles[" . $idx . "]";

          /*
           * Attachment name: prefer the Unicode file name (UF, a PDF 1.7
           * feature), then the plain file name (F), else a placeholder.
           */
          $label = "(unnamed)";
          foreach (array("UF", "F") as $key)
          {
              $keypath = $entry . "/" . $key;
              if ($tet->pcos_get_string($doc, "type:" . $keypath) == "string")
              {
                  $label = $tet->pcos_get_string($doc, $keypath);
                  break;
              }
          }

          /* only entries that actually carry a stream are processed */
          $streampath = $entry . "/EF/F";
          if ($tet->pcos_get_string($doc, "type:" . $streampath) != "stream")
          {
              continue;
          }

          fwrite($outfp, "\n----- File attachment '" . $label . "':\n");
          $payload = $tet->pcos_get_stream($doc, "", $streampath);
          process_document($outfp, null, $label, $payload);
          fwrite($outfp, "----- End file attachment '" . $label . "'\n");
      }

      // ---------- Step 3: page-level file attachments

      $pageTotal = $tet->pcos_get_number($doc, "length:pages");

      // Inspect the annotations of every page, looking for FileAttachment
      for ($p = 0; $p < $pageTotal; $p++)
      {
          $annotTotal = $tet->pcos_get_number($doc, "length:pages[" . $p . "]/Annots");

          for ($a = 0; $a < $annotTotal; $a++)
          {
              $annotpath = "pages[" . $p . "]/Annots[" . $a . "]";

              $subtype = $tet->pcos_get_string($doc, $annotpath . "/Subtype");
              if ($subtype != "FileAttachment")
              {
                  continue;
              }

              /* fetch the contents of the attachment, if it has a stream */
              $streampath = $annotpath . "/FS/EF/F";
              if ($tet->pcos_get_string($doc, "type:" . $streampath) != "stream")
              {
                  continue;
              }

              /* the label uses 1-based page and annotation numbers */
              $label = "page " . ($p + 1) . ", annotation " . ($a + 1);

              fwrite($outfp, "----- Page level attachment '" . $label . "':\n");
              $payload = $tet->pcos_get_stream($doc, "", $streampath);
              process_document($outfp, null, $label, $payload);
              fwrite($outfp, "----- End page level attachment '" . $label . "'\n");
          }
      }

      $tet->close_document($doc);
  }

  
// Open the output file and stop early if that fails; otherwise the failure
// would only surface later when fwrite() is called with an invalid handle.
$outfp = fopen($outfilename, "w");

if ($outfp === false)
{
    printf("Couldn't open output file '%s'\n", $outfilename);
    exit(2);
}

// process_document() returns a code meant to be used as the exit status.
$exitcode = process_document($outfp, $infilename, $infilename, null);

fclose($outfp);

exit($exitcode);
