#!/usr/bin/env ruby
#
# PDF text extractor which also searches PDF file attachments.
#

require 'TET'

# Global option list, handed to set_option for every TET object created here.
# It sets the searchpath to ../data and ../../../resource/cmap.
$globaloptlist = 'searchpath={{../data} ' \
                 '{../../../resource/cmap}}'

# Document-specific option list, handed to open_document.
$docoptlist = ''

# Page-specific option list, handed to open_page.
$pageoptlist = 'granularity=page'

# Separator to emit after each chunk of text. The right value depends on the
# application's needs; for granularity=word a space character may be useful.
$separator = "\n"

# Extract the text of every page of a document for which a TET document
# handle is already available, and write it to outfp. Each retrieved chunk of
# text is followed by $separator. Pages are opened with $pageoptlist.
#
# Errors on individual pages are reported on standard output; a page that
# cannot be opened is skipped and extraction continues with the next page.
#
# @param tet
#            The TET object
# @param doc
#            A valid TET document handle
# @param outfp
#            Output file handle (only #print is called on it)
def extract_text(tet, doc, outfp)
  # Get number of pages in the document.
  n_pages = tet.pcos_get_number(doc, "length:pages")

  # Loop over pages (page numbers passed to open_page start at 1).
  1.upto(n_pages) do |pageno|
    page = tet.open_page(doc, pageno, $pageoptlist)

    if page == -1
      printf("Error [%d] in %s() on page %d: %s\n",
             tet.get_errnum, tet.get_apiname, pageno, tet.get_errmsg)
      # `next` (not `continue`, which does not exist in Ruby) moves on to
      # the following page.
      next
    end

    # Retrieve all text fragments. This loop is actually not required
    # for granularity=page, but must be used for other granularities.
    until (text = tet.get_text(page)).nil?
      outfp.print text       # the retrieved text
      outfp.print $separator # separator between chunks of text
    end

    # get_text returning nil can mean "no more text" or "error"; the error
    # number tells the two apart.
    if tet.get_errnum != 0
      printf("Error [%d] in %s() on page %d: %s\n",
             tet.get_errnum, tet.get_apiname, pageno, tet.get_errmsg)
    end

    tet.close_page(page)
  end
end

  
  # Open a named physical or virtual file, extract the text from it, search
  # for document or page attachments, and process these recursively. Either
  # filename must be supplied for physical files, or data from which a
  # virtual (PVF) file will be created. The caller cannot create the PVF file
  # since a new TET object is created here: if an exception happens with the
  # embedded document, the caller can continue with its own TET object.
  #
  # Errors are reported on standard output and are not propagated; they are
  # reflected in the return value instead.
  #
  # @param outfp    Output stream receiving the extracted text
  # @param filename Name of the file to open, or nil to use data instead
  # @param realname Name used in error messages
  # @param data     Contents of the document when filename is nil
  #
  # @return 0 if successful, otherwise a non-zero code to be used as exit
  #         status (5 if the document could not be opened, 1 if an
  #         exception was raised while processing it)
  def process_document(outfp, filename, realname, data)
    retval = 0
    pvfname = "/pvf/attachment"
    tet = nil

    begin
      tet = TET.new

      # Construct a PVF file if data instead of a filename was provided.
      if filename.nil?
        tet.create_pvf(pvfname, data, "")
        filename = pvfname
      end

      tet.set_option($globaloptlist)

      doc = tet.open_document(filename, $docoptlist)

      if doc == -1
        printf("Error [%d] in %s (source : attachment '%s'): %s\n",
               tet.get_errnum, tet.get_apiname, realname, tet.get_errmsg)
        retval = 5
      else
        process_document_single(outfp, tet, doc)
      end

      # If there was no PVF file deleting it won't do any harm.
      tet.delete_pvf(pvfname)
    rescue TETException => pe
      print pe.backtrace.join("\n") + "\n"
      print "Error [" + pe.get_errnum.to_s + "] " + pe.get_apiname +
            ": " + pe.get_errmsg + "\n"
      # Report the failure to the caller instead of pretending success.
      retval = 1
    rescue StandardError => e
      # StandardError rather than Exception, so that Interrupt, SystemExit
      # and similar are not swallowed here.
      print e.backtrace.join("\n") + "\n" + e.to_s + "\n"
      retval = 1
    ensure
      # tet is still nil if TET.new itself raised.
      tet.delete if tet
    end

    retval
  end

  # Process a single, already opened document: extract its own text, then
  # hand every document-level and page-level file attachment that is stored
  # as a stream to process_document, and finally close the document.
  #
  # @param outfp Output stream for the extracted text and attachment banners
  # @param tet   The TET object
  # @param doc   The TET document handle; closed at the end of this method
  def process_document_single(outfp, tet, doc)
    # ---- The document's own page contents
    extract_text(tet, doc, outfp)

    # ---- Document-level file attachments
    embedded_total = tet.pcos_get_number(doc, "length:names/EmbeddedFiles")

    0.upto(embedded_total - 1) do |index|
      entry = "names/EmbeddedFiles[#{index}]"

      # Name of the attachment: the Unicode file name /UF (a PDF 1.7
      # feature) is tried first, then /F; the first one that is a string wins.
      name_key = %w[UF F].find do |key|
        tet.pcos_get_string(doc, "type:#{entry}/#{key}") == "string"
      end
      attname = name_key ? tet.pcos_get_string(doc, "#{entry}/#{name_key}") : "(unnamed)"

      # Only attachments whose contents are available as a stream are processed.
      stream_path = "#{entry}/EF/F"
      next unless tet.pcos_get_string(doc, "type:#{stream_path}") == "stream"

      outfp.print "\n----- File attachment '#{attname}':\n"
      contents = tet.pcos_get_stream(doc, "", stream_path)
      process_document(outfp, nil, attname, contents)
      outfp.print "----- End file attachment '#{attname}'\n"
    end

    # ---- Page-level file attachments: annotations of type FileAttachment
    page_total = tet.pcos_get_number(doc, "length:pages")

    0.upto(page_total - 1) do |page|
      annot_total = tet.pcos_get_number(doc, "length:pages[#{page}]/Annots")

      0.upto(annot_total - 1) do |annot|
        annot_path = "pages[#{page}]/Annots[#{annot}]"
        next unless tet.pcos_get_string(doc, "#{annot_path}/Subtype") == "FileAttachment"

        attpath = "#{annot_path}/FS/EF/F"
        next unless tet.pcos_get_string(doc, "type:#{attpath}") == "stream"

        # pCOS indices are zero-based; the label shown to the user is one-based.
        attname = "page #{page + 1}, annotation #{annot + 1}"

        outfp.print "----- Page level attachment '#{attname}':\n"
        contents = tet.pcos_get_stream(doc, "", attpath)
        process_document(outfp, nil, attname, contents)
        outfp.print "----- End page level attachment '#{attname}'\n"
      end
    end

    tet.close_document(doc)
  end

# Script entry point: extract the text of <infilename>, including the text of
# all its attachments, into <outfilename>.
if ARGV.length != 2
  raise "usage: get_attachments.rb <infilename> <outfilename>"
end

# The block form of File.open closes the output file even if processing raises.
retval = File.open(ARGV[1], "w") do |outfp|
  process_document(outfp, ARGV[0], ARGV[0], nil)
end

# process_document returns a code meant to be used as exit status; pass a
# failure on to the calling process.
exit(retval) unless retval == 0
    