     H DEBUG(*YES) DFTACTGRP(*NO) BNDDIR('TETLIB/TETLIB')
     H COPYRIGHT('(c) PDFlib GmbH (www.pdflib.com)')
      **************************************************************************
      *   PDFlib TET sample application.
      *
      *   PDF text extractor which also searches PDF file attachments.
      *   The file attachments may be attached to the document or
      *   to page-level annotations of type FileAttachment. The former construct
      *   also covers PDF 1.7 packages (a.k.a. PDF collections).
      *
      *   Nested attachments (file attachments within file attachments,
      *   or nested PDF packages): all embedded files are processed recursively.
      **************************************************************************
     d/copy QRPGLESRC,TETLIB
     d/copy QRPGLESRC,IFSIO
      **************************************************************************

      * Global work fields
      *   outfd    - descriptor of the output file opened by the main line
      *   ret      - receives the results of process_document and close
      *   filename - input file name converted to UCS-2 by the main line
      *   data     - set to *NULL by the main line for the top-level call
      *   pdfpage  - page handle assigned in extract_text (TET_open_page)
     d outfd           s             10i 0
     d ret             s             10i 0
     d filename        s            256c   varying
     d data            s               *
     d pdfpage         s             10i 0

      * Program entry point GET_ATTACH: two 128-byte character parameters
     D Main            PR                  ExtPgm('GET_ATTACH')
     D                              128a
     D                              128a

      * Entry parameters: name of the input file and name of the output file
     D Main            PI
     D  InfileName                  128a
     D  OutfileName                 128a

      * Prototypes of the subprocedures defined further down in this source

      * exit: displays up to two lines of error text with DSPLY
     d exit            pr
     d   error1                      45a   value
     d   error2                      52a   value

      * process_document: extracts the text of one document and of all its
      * attachments (recursively). filename and length are passed by value,
      * the other parameters by reference. Returns 0, or a non-zero code
      * when the TET object or the document could not be set up.
     d process_document...
     d                 pr            10i 0
     d outfd                         10i 0
     d filename                     256c   varying
     d                                     value
     d realname                     256c   varying
     d data                            *
     d length                        10i 0 value

      * extract_text: writes the text of all pages of an open document
     d extract_text    pr
     d tet                             *
     d doc                           10i 0
     d outfd                         10i 0

      * External program QDCXLATE.
      * NOTE(review): no call to Translate is visible in this source -
      * confirm whether the prototype is still needed.
     D Translate       PR                  ExtPgm('QDCXLATE')
     D   Length                       5P 0 const
     D   Data                     32766A   options(*varsize)
     D   Table                       10A   const

      * utf8_write: converts a string to UTF-8 and writes it to outfd.
      * The type of out comes from r_text_long_u, which is not defined in
      * this source (presumably in the TETLIB copy member - confirm).
     d utf8_write      pr
     d  tet                            *   value
     d  outfd                        10i 0 value
     d  out                                value like(r_text_long_u)
      **************************************************************************
      /free

        // Main line: validate the parameters, open the output file, extract
        // the text of the input document and all of its attachments, then
        // close the output file.

        ret = 0;

        // Both file names are required. %parms is tested first so that a
        // parameter that was not passed is never referenced. A blank input
        // name is rejected as well: process_document treats an empty file
        // name as "use the in-memory data", and there is no data here.
        if %parms < 2 or InfileName = *blanks or OutfileName = *blanks;
          exit ('usage: CALL PGM(GET_ATTACH) ' :
                'PARM(''<infile>'' ''<outfile>'')');
          *inlr = *on;
          return;
        endif;

        // NOTE(review): O_EXCL combined with O_CREAT normally makes open()
        // fail when the file already exists, in which case O_TRUNC never
        // takes effect - confirm whether overwriting was intended.
        outfd = open(%trim(OutfileName):
                     O_WRONLY+O_CREAT+O_TRUNC+O_EXCL:S_IRWXU+S_IRWXG);

        if (outfd < 0);
          exit ('Couldn''t open "' + %trim(outfilename) + '".' :
                '');
          *inlr = *on;
          return;
        endif;

        // Top-level call: a physical file, so no in-memory data is passed.
        // The file name doubles as the display name used in messages.
        filename = %ucs2(InfileName);
        data = *NULL;
        ret = process_document(outfd : filename : filename : data : 0);

        ret = close(outfd);

        *inlr = *on;
        return;

      /end-free

     p exit            B
      // Display an error report with DSPLY.
      //   error1 - first text, shown with the prefix 'Error: '
      //   error2 - optional second text, shown on its own line
      // Nothing is displayed when both texts are blank.
     d exit            PI
     d   error1                      45a   value
     d   error2                      52a   value

     d line            s             52a

      /free
        // Guard: no text at all means there is nothing to report
        if error1 = *blanks and error2 = *blanks;
          return;
        endif;

        // The first line is always shown, even when only error2 has text
        line = 'Error: ' + error1;
        dsply line;

        if error2 = *blanks;
          return;
        endif;

        line = error2;
        dsply line;
      /end-free
     P                 E

      // Open a named physical or virtual file, extract the text from it,
      // search for document or page attachments, and process these
      // recursively.
      //
      //   outfd    - descriptor of the output file
      //   filename - name of a physical file, or empty when data and length
      //              describe an in-memory document
      //   realname - name used for the document in error messages
      //   data     - pointer to the in-memory document (used only when
      //              filename is empty)
      //   length   - number of bytes at data
      //
      // Returns 0 on success, 4 when no TET object could be created and
      // 5 when the document could not be opened.
      //
      // The caller cannot create the PVF file since a new TET object is
      // created here in case an exception happens with the embedded
      // document - the caller can continue with its own TET object even in
      // case of an exception here.

     P process_document...
     P                 B
     d process_document...
     d                 PI            10i 0
     d outfd                         10i 0
     d filename                     256c   varying
     d                                     value
     d realname                     256c   varying
     d data                            *
     d length                        10i 0 value

      * global option list
     d globaloptlist   c                   %ucs2('-
     d                                     searchpath={{../data} -
     d                                     {../../../resource/cmap} -
     d                                     outputformat=utf8}')
      * document-specific option list
     d docoptlist      c                   %ucs2('')

      * Variables
     d tet             s               *
     d pvfname         s            512c   varying
     d doc             s             10i 0
     d file            s             10i 0
     d filecount       s             10i 0
     d objtype         s             10i 0
     d attname         s            256c   varying
      * attdata itself is never referenced; the declaration provides the
      * basing pointer attdata_p that receives the attachment contents
     d attdata         s          32766c   varying
     d                                     based(attdata_p)
     d attlength       s             10i 0
     d res             s             10i 0
     d separator       s              1    inz(x'0a')
     d pagecount       s             10i 0
     d pagenr          s             10i 0
     d annot           s             10i 0
     d annotcount      s             10i 0
     d val             s            256c   varying

      /free
        tet = TET_new;

        if tet = *null;
          exit ('extractor: out of memory' :
                '');
          return 4;
        endif;

        // Construct a PVF file if data instead of a filename was provided
        if (filename = %ucs2(''));
          pvfname = '/pvf/attachment';
          TET_create_pvf(tet : pvfname : data : length :
                         %ucs2(''));
          filename = pvfname;
        endif;

        TET_set_option(tet:globaloptlist);

        doc = TET_open_document(tet : %ucs2(%trim(filename)) : docoptlist);

        if (doc = -1);
          // exit() shows at most 45 characters of its first text, so the
          // TET error message is passed as the second text, which is
          // displayed on a line of its own.
          exit ('Error ' + %trim(%char(TET_get_errnum(tet))) + ' in ' +
                %char(TET_get_apiname(tet)) + '() (source: attachment ''' +
                %trim(%char(realname)) + ''')' :
                %char(TET_get_errmsg(tet)));
          TET_delete(tet);
          return 5;
        endif;

        // Extract the document's own page contents, followed by two
        // line feeds
        extract_text(tet : doc : outfd);
        res = write(outfd : %addr(separator) : 1);
        res = write(outfd : %addr(separator) : 1);

        // -------------------- Process all document-level file attachments

        // Get the number of document-level file attachments.
        filecount = TET_pcos_get_number(tet : doc :
                                        %ucs2('length:names/EmbeddedFiles'));
        for file = 0 to filecount - 1;

          // fetch the name of the file attachment; check for Unicode file
          // name (a PDF 1.7 feature)
          objtype = TET_pcos_get_number(
                      tet : doc :
                      %ucs2('type:names/EmbeddedFiles[' +
                            %char(file) + ']/UF'));

          if (objtype = pcos_ot_string);
            attname = TET_pcos_get_string(
                        tet : doc :
                        %ucs2('names/EmbeddedFiles[' + %char(file) + ']/UF'));
          else;
            // no Unicode name: fall back to the plain file name
            objtype = TET_pcos_get_number(
                        tet : doc :
                        %ucs2('type:names/EmbeddedFiles[' +
                              %char(file) + ']/F'));

            if (objtype = pcos_ot_string);
              attname = TET_pcos_get_string(
                          tet : doc :
                          %ucs2('names/EmbeddedFiles[' + %char(file) + ']/F'));
            else;
              attname = %ucs2('(unnamed)');
            endif;
          endif;

          // fetch the contents of the file attachment and process it
          objtype = TET_pcos_get_number(
                      tet : doc :
                      %ucs2('type:names/EmbeddedFiles[' +
                            %char(file) + ']/EF/F'));

          if (objtype = pcos_ot_stream);
            utf8_write(tet : outfd : %ucs2('----- File attachment '''));
            utf8_write(tet : outfd : attname);
            utf8_write(tet : outfd : %ucs2(''':'));
            res = write(outfd : %addr(separator) : 1);

            attdata_p = TET_pcos_get_stream(
                          tet : doc : attlength : %ucs2('') :
                          %ucs2('names/EmbeddedFiles[' +
                                %char(file) + ']/EF/F'));

            // An empty file name makes the recursive call build a PVF
            // file from the attachment data
            res = process_document(outfd : %ucs2('') : attname :
                                   attdata_p : attlength);
            utf8_write(tet : outfd : %ucs2('----- End file attachment '''));
            utf8_write(tet : outfd : attname);
            utf8_write(tet : outfd : %ucs2(''':'));
            res = write(outfd : %addr(separator) : 1);
          endif;
        endfor;

        // -------------------- Process all page-level file attachments
        pagecount = TET_pcos_get_number(tet : doc : %ucs2('length:pages'));

        // Check all pages for annotations of type FileAttachment

        for pagenr = 0 to pagecount - 1;
          annotcount = TET_pcos_get_number(
                         tet : doc :
                         %ucs2('length:pages[' +
                                %char(pagenr) + ']/Annots'));

          for annot = 0 to annotcount - 1;
            val = TET_pcos_get_string(tet : doc :
                                      %ucs2('pages[' +
                                            %char(pagenr) + ']/Annots[' +
                                            %char(annot) + ']/Subtype'));
            // Page attachments are named by their 1-based position
            attname = 'page ' + %char(pagenr + 1) +
                      ', annotation ' + %char(annot + 1);

            if (%trim(val) = 'FileAttachment');
              // fetch the contents of the attachment and process it
              objtype = TET_pcos_get_number(
                          tet : doc :
                          %ucs2('type:pages[' +
                                %char(pagenr) + ']/Annots[' +
                                %char(annot) + ']/FS/EF/F'));

              if (objtype = pcos_ot_stream);
                attdata_p = TET_pcos_get_stream(
                              tet : doc : attlength : %ucs2('') :
                              %ucs2('pages[' +
                                %char(pagenr) + ']/Annots[' +
                                %char(annot) + ']/FS/EF/F'));

                res = process_document(outfd : %ucs2('') :
                                       attname : attdata_p : attlength);
              endif;
            endif;
          endfor;
        endfor;

        TET_close_document(tet : doc);

        // If there was no PVF file deleting it won't do any harm
        TET_delete_pvf(tet : pvfname);

        TET_delete(tet);

        return 0;
      /end-free
     P                 E

      // Extract text from a document for which a TET handle is already
      // available and write it to an open file descriptor.
      //
      //   tet   - TET object handle the document was opened with
      //   doc   - document handle (result of TET_open_document)
      //   outfd - descriptor of the output file
      //
      // A page that cannot be opened is reported through exit() and
      // skipped; the remaining pages are still processed. The page handle
      // is kept in the global field pdfpage.
     p extract_text    B
     d extract_text    PI
     d tet                             *
     d doc                           10i 0
     d outfd                         10i 0

      * page-specific option list
     d pageoptlist     c                   %ucs2('granularity=page')

     d n_pages         s             10i 0
     d pageno          s             10i 0
      * len receives the length reported by TET_get_text; the number of
      * bytes written is taken from the null-terminated string instead
     d len             s             10i 0
     d res             s             10i 0
      * separator to emit after each chunk of text. This depends on the
      * application's needs; for granularity=word a space character may be
      * useful. It is a field rather than a constant so that its address
      * can be passed to write().
     d separator       s              1    inz(x'0a')
     d text            s          32766c   based(text_p)

      /free

        // get number of pages in the document
        n_pages=TET_pcos_get_number(tet : doc : %ucs2('length:pages'));

        // loop over pages in the document
        for pageno = 1 to n_pages;
          pdfpage = TET_open_page(tet : doc : pageno : pageoptlist);

          if pdfpage = -1;
            exit ('Error ' + %trim(%char(TET_get_errnum(tet))) + ' in ' +
                  %char(TET_get_apiname(tet)) + '() on page ' + %char(pageno) +
                  ': ' + %char(TET_get_errmsg(tet)) :
                  '');
            iter;  // try next page
          endif;

          // Retrieve all text fragments; This loop is actually not required
          // for granularity=page, but must be used for other granularities.
          text_p = TET_get_text(tet : pdfpage : len);
          dow (text_p <> *NULL);
            res = write(outfd : text_p : %len(%str(text_p)));
            // terminate the chunk so that consecutive chunks (here: pages)
            // do not run together in the output
            res = write(outfd : %addr(separator) : 1);
            text_p = TET_get_text(tet : pdfpage : len);
          enddo;

          // A null result ends the loop; an error number other than zero
          // means the loop ended because of an error
          if (TET_get_errnum(tet) <> 0);
            exit ('Error ' + %trim(%char(TET_get_errnum(tet))) + ' in ' +
                  %char(TET_get_apiname(tet)) + '() on page ' + %char(pageno) +
                  ': ' + %char(TET_get_errmsg(tet)) :
                  '');
          endif;

          TET_close_page(tet : pdfpage);
        endfor;
      /end-free
     P                 E
      *********************************************************************************************
      // Convert a string to UTF-8 with TET_convert_to_unicode and write the
      // result to an open file descriptor.
      //
      //   tet   - TET object handle used for the conversion
      //   outfd - descriptor of the output file
      //   out   - string to write; passed to the conversion as 'utf16'
      //
      // Nothing is written for an empty string or when the conversion
      // returns no data.
     p utf8_write      b

     d utf8_write      pi
     d  tet                            *   value
     d  outfd                        10i 0 value
     d  out                                value like(r_text_long_u)

      * out_fix is a fixed-length copy of out; rtn_value overlays it so
      * that the buffer can be handed to TET_convert_to_unicode
     d out_fix         s          16383c
     d rtn_value       s          32767    based(rtn_value_p)
     d len_out         s             10i 0
     d rtnlen          s             10i 0
     d convert_value   s          65535    based(convert_value_p)
     d result          s             10i 0

      /free
          len_out = %len(out);

          // An empty string produces no output
          if len_out = 0;
            return;
          endif;

          out_fix = out;
          rtn_value_p = %addr(out_fix);
          rtnlen = 0;

          // The input length is given in bytes: two per character of out
          convert_value_p = TET_convert_to_unicode (
                              tet :
                              %ucs2('utf16') :
                              rtn_value :
                              len_out * 2 :
                              rtnlen :
                              %ucs2('outputformat=utf8'));

          // Write the number of bytes reported back by the conversion.
          // The character count of the input is too small as soon as a
          // character needs more than one byte in UTF-8.
          if convert_value_p <> *null and rtnlen > 0;
            result = write(outfd : convert_value_p : rtnlen);
          endif;
      /end-free
     p utf8_write      e