/***************************************************************************
 *   Copyright (C) 2005 by Andreas Pokorny                                 *
 *   andreas.pokorny@biozentrum.uni-wuerzburg.de                           *
 *                                                                         *
 *   This file is part of profdist and cbcanalyzer                         *
 *                                                                         *
 *   Both profdist and cbcanalyzer are free software; you can redistribute * 
 *   it and/or modify it under the terms of the GNU General Public License * 
 *   as published by the Free Software Foundation; either version 2 of the * 
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   Profdist and cbcanalyzer are distributed in the hope that it will be  *
 *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty   *
 *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
// #define BOOST_SPIRIT_DEBUG

// #define BOOST_SPIRIT_DEBUG_FLAGS BOOST_SPIRIT_DEBUG_FLAGS_NODES
#include <list>
#include <stdexcept>
#include <string>
#include <cassert>
#include <sstream>
#include <boost/cstdlib.hpp>
#include <boost/spirit/phoenix.hpp>
#include <boost/spirit/core.hpp>
#include <boost/spirit/symbols.hpp>
#include <boost/spirit/attribute.hpp>
#include <boost/spirit/dynamic.hpp>
#include <boost/spirit/core/primitives/primitives.hpp>
#include <boost/spirit/utility/grammar_def.hpp>
#include <boost/spirit/utility/lists.hpp>
#include <boost/spirit/iterator/file_iterator.hpp>
#include <boost/spirit/iterator/position_iterator.hpp>
#include <boost/spirit/core/composite/epsilon.hpp>
#include <boost/spirit/actor/push_back_actor.hpp>
#include <boost/spirit/actor/clear_actor.hpp>
#include <boost/spirit/actor/increment_actor.hpp>

#include "parser.h"
#include "spirit_helper.h"

using namespace boost;
using namespace phoenix;

/**
 * Simple EMBL grammar: it generates a list of sequence info items.
 * A lot of the information contained in an EMBL file is skipped.
 */
struct embl_grammar
:   public boost::spirit::grammar<embl_grammar >
{
  /// Destination list; the rule registered for the "//" terminator appends
  /// the entry parsed so far to it.
  std::list<file_sequence> & entries;
  /// Binds the grammar to the list that receives the parsed entries. The list
  /// is held by reference.
  embl_grammar( std::list<file_sequence> & entries ) : entries(entries) {}
  
  /**
   * Grammar definition: holds the rules, the symbol table of line ids and
   * the temporary state (current range / current entry) used while parsing.
   */
  template<typename ScannerT>
  struct definition
  {
    typedef boost::spirit::rule<ScannerT> rule_t;

    /**
     * behind_id_rule_t is the type of all rules that parse the text that
     * follows a line id such as ID, CC or SQ.
     */ 
    typedef boost::spirit::stored_rule<ScannerT, boost::spirit::parser_context<int> > behind_id_rule_t;

    /**
     * Closure with a single member that stores the rule looked up for the
     * id of the current line, so that it can be invoked on the rest of
     * that line.
     */
    struct line_closure;
    typedef boost::spirit::closure< line_closure, typename behind_id_rule_t::alias_t > closure_base_t;

    struct line_closure : closure_base_t
    {           
      typename closure_base_t::member1 line_rule; ///< rule selected by the line id
    };

    // Type of the start rule; it carries the line_closure context.
    typedef boost::spirit::rule<ScannerT, typename line_closure::context_t> line_rule_t;

    line_rule_t lines; ///< Starting rule, reads several lines
    boost::spirit::chset<> classifier_c; ///< Characters allowed inside one classification item
    behind_id_rule_t skip_line, sequence_entry, end_sequence, id_line, classification_line;
    boost::spirit::symbols<behind_id_rule_t> base; ///< This symbol table stores all embl line ids, and the associated parser for that kind of line.

    file_sequence::string_range current_range; ///< temporary storage for list entries 
    file_sequence current_sequence; ///< Sequence entry which gets parsed at the moment,


    /**
     * Builds all rules and fills the symbol table of line ids.
     * \param self the enclosing grammar; its entries list receives every
     *             completed sequence entry
     */
    definition(embl_grammar const& self)
    {
      using boost::spirit::blank_p;
      using boost::spirit::assign_a;
      using boost::spirit::anychar_p;
      using boost::spirit::list_p;
      using boost::spirit::eps_p;
      using boost::spirit::ch_p;
      using boost::spirit::chset;
      using boost::spirit::push_back_a;
      using boost::spirit::graph_p;
      using boost::spirit::increment_a;
      using boost::spirit::clear_a;
      using boost::spirit::alpha_p;
      using boost::spirit::uint_p;

      // A classification item is any run of characters up to a line break
      // or a ';'. Note that blanks and '.' are part of this set.
      classifier_c = ~chset<>("\n\r;");


      // One or more lines. For each line the id is looked up in the symbol
      // table `base`, the rule registered for that id is stored in the
      // closure member and then invoked on the rest of the line.
      // NOTE(review): lazy_p and lf_p are not declared in this file -
      // presumably they are provided by spirit_helper.h; confirm.
      lines =
        +(            // empty files are not accepted  FIXTHAT?
            base[ lines.line_rule = arg1 ] // set next parser
            >> *blank_p
            >> lazy_p(lines.line_rule) // call dynamic parser
            >> *blank_p 
            >> *lf_p
         )
        ;

      // Consumes printable characters, spaces and tabs without storing
      // anything.
      skip_line = 
        *( graph_p | ' ' | '\t' )
        ;

      // Matches the same text as skip_line, but stores the matched range
      // as the id of the current entry.
      id_line =  
        ( *( graph_p | ' ' | '\t' ) )[ assign_a( current_sequence.id )]
        ;

      // List of items separated by ';'. Every item is stored in
      // current_range first and then appended to the classification list
      // of the current entry. Any number of trailing ';' or '.' characters
      // is accepted after the list.
      classification_line = 
        *blank_p
        >> list_p( 
            (+classifier_c)[ assign_a( current_range )][ push_back_a(current_sequence.classification, current_range) ]
            , *blank_p 
            >> ';'
            >> *blank_p 
            )
        >> *blank_p 
        >> *(
            ch_p(';') 
            | '.'
            )
        ;

      // The text behind SQ is skipped via skip_line; the sequence data
      // follows. Each group consists of runs of letters or '-' mixed with
      // blanks, an optional unsigned number and optional line feeds. Every
      // matched letter or '-' increments sequence_length, and every run is
      // appended to sequence_data of the current entry.
      sequence_entry = 
        skip_line  
        >> *lf_p
        >> *blank_p 
        >> +( 
            +( 
              (+((alpha_p|'-') [increment_a( current_sequence.sequence_length ) ] ) )
              [assign_a( current_range )]
              [push_back_a(current_sequence.sequence_data, current_range ) ] 
              | blank_p 
             ) 
            >> !(uint_p)
            >> *lf_p
            )
        ;
        
      // Consumes no input: appends the current entry to the list owned by
      // the grammar, then applies clear_a to current_sequence so the next
      // entry starts from a cleared object.
      end_sequence = 
        eps_p
        [ push_back_a(self.entries, current_sequence)]
        [ clear_a(current_sequence) ]
        ;

      // Associates every known line id with the rule that parses the rest
      // of such a line.
      // NOTE(review): the table is filled after all line rules have been
      // assigned; presumably this order matters because the table stores
      // copies of the rule objects - confirm before reordering.
      base.add
        ("ID", id_line ) // always the first line; format is 
        // ID   entryname  dataclass; [circular] molecule; division; sequencelength BP
        ("AC", skip_line ) //  The AC (ACcession number) line lists the accession numbers 
        // associated with the entry. example: AC   X56734; S46826; 
        ("SV", skip_line ) // The SV (Sequence Version) line details the sequence version
        //  
        ("DT", skip_line) // The DT (DaTe) line shows when an entry first appeared 
        // in the database and when it was last updated.
        // DT   DD-MON-YYYY (Rel. #, Created)
        // DT   DD-MON-YYYY (Rel. #, Last updated, Version #)
        ("DE", skip_line) // The DE (Description) lines contain general descriptive 
        // information about the sequence stored. Without a specific format
        ("KW", skip_line) //  The KW (KeyWord) lines provide information which can be used to 
        // generate cross-reference indexes of the sequence entries based 
        // on functional, structural, or other categories deemed important. 
        // KW   keyword[; keyword ...].
        // so separated by ';' and stopped by '.'
        ("OS", skip_line) // Organism Species line 
        ("OC", classification_line) // Organism Classification  .. like KW
        ("RC", skip_line) // Reference lines 
        ("RP", skip_line)
        ("RX", skip_line)
        ("RG", skip_line)
        ("RA", skip_line)
        ("RT", skip_line)
        ("RN", skip_line)
        ("RL", skip_line)
        ("NI", skip_line) // Not documented?!, found in an example file
        ("DR", skip_line) // Database Cross-reference - a list like KW
        ("AH", skip_line) // Third Party Annotation
        ("AS", skip_line) // ASsembly Information
        ("CO", skip_line) // Con(structed) or Con(tig) sequences in the CON database 
        ("FH", skip_line) // Feature Header
        ("FT", skip_line) // Feature Table
        ("SQ", sequence_entry ) // SeQuence header
        // summary of contents:
        // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 
        ("CC", skip_line) // free text comments about the entry,
        ("XX", skip_line) // line contains no data or comments
          ("//", end_sequence) // terminator
          ;
#ifdef BOOST_SPIRIT_DEBUG
      // Give the rules readable names in the Spirit debug output.
      BOOST_SPIRIT_DEBUG_RULE(lines);
      BOOST_SPIRIT_DEBUG_RULE(skip_line);
      BOOST_SPIRIT_DEBUG_RULE(sequence_entry);
      BOOST_SPIRIT_DEBUG_RULE(end_sequence);
      BOOST_SPIRIT_DEBUG_RULE(id_line);
      BOOST_SPIRIT_DEBUG_RULE(classification_line);
#endif
    }

    /// Returns the rule the parse starts with.
    line_rule_t const& start() const
    {
      return lines;
    }
  };
};

//-----------------------------------------------------------------------------
void parse_embl( std::string const& filename, std::list<file_sequence>& sequences )
{
  using namespace boost::spirit;
  typedef char char_t;
  typedef position_iterator<file_iterator<char_t> > iterator_t;
  //typedef position_iterator<<char_t> > iterator_t;
  file_iterator<char_t> file_handle( filename );

  if (!file_handle)
    throw std::runtime_error( ("Unable to open file " + filename) );

  
  iterator_t first( file_handle, file_handle.make_end(), filename );
  iterator_t end( file_handle.make_end(), file_handle.make_end(), filename );


  embl_grammar grammar( sequences ); 
#ifdef BOOST_SPIRIT_DEBUG
  BOOST_SPIRIT_DEBUG_NODE(grammar);
#endif 
  // Define your rule

  boost::spirit::parse_info<iterator_t> info = boost::spirit::parse( first, end, grammar);
  if( !info.full )
  {
    std::ostringstream out;
    out << "Parsing failed at line " << info.stop.get_position().line << " and column " 
      << info.stop.get_position().column << " in file " << info.stop.get_position().file 
      << '\n' << sequences.size() << " Sequences were found.\nError happened near:\n[...]" 
      << std::string( std::max( first, info.stop - 20 ), info.stop) << '\n';
    throw std::runtime_error( out.str() );
  }
}

//-----------------------------------------------------------------------------

#ifdef BUILD_EMBL_TEST
#include <iostream>
#include <fstream>
#include <sstream>
int main( int argc, char ** argv )
{
 
  if( argc != 1 )
  {
    using boost::spirit::file_iterator;
    using boost::spirit::position_iterator;
    typedef char char_t;
    typedef position_iterator<file_iterator<char_t> > iterator_t;
    file_iterator<char_t> file_handle( argv[1]);

    typedef boost::spirit::position_iterator<boost::spirit::file_iterator<char> > it;

    iterator_t first( file_handle, file_handle.make_end(), argv[1]);
    iterator_t last( file_handle.make_end(), file_handle.make_end(), argv[1]);

    if( ! file_handle )
    {
       std::cout << "Unable to open file!\n";

       // Clean up, throw an exception, whatever
       return -1;
    }

    std::list<file_sequence> seqs;
    embl_grammar grammar(seqs);
#ifdef BOOST_SPIRIT_DEBUG
    BOOST_SPIRIT_DEBUG_NODE(grammar);
#endif

    // Define your rule

    boost::spirit::parse_info<iterator_t> info = boost::spirit::parse(   first,  last,  grammar  );

    if( info.hit && !info.full )
    {
      std::cout << " hit " << std::string(first, info.stop) << "<-- matched up to this point!" <<  std::endl;
    }
  
    if( info.full || info.hit )
    {
      for( std::list<file_sequence>::const_iterator it = seqs.begin(), e = seqs.end();
          it != e; ++it)
      {
        std::cout << "ID:" << std::string(it->id.first, it->id.second) << std::endl;
        std::cout << "CLASSIFCATION:" << std::endl;
        for( std::list<file_sequence::string_range>::const_iterator _it = it->classification.begin(),
            _e = it->classification.end();
            _it != _e; ++_it )
          std::cout << '|' << std::string(_it->first, _it->second ) << "|";
        std::cout << std::endl << "SEQUENCE:" << std::endl;
        for( std::list<file_sequence::string_range>::const_iterator _it = it->sequence_data.begin(),
            _e = it->sequence_data.end();
            _it != _e; ++_it )
          std::cout << '|' << std::string(_it->first, _it->second ) << "|";
        std::cout << std::endl;
      }
    }
    else 
    {
      std::cout << "Did not work!" << std::endl; 
    }
  }
  else {
    std::cout << "Add a file name to the command to test the embl parser" << std::endl;
    
  }

}

#endif // BUILD_EMBL_TEST

