/* LineGroup.java
 *
 * created: Mon Oct 12 1998
 *
 * This file is part of Artemis
 *
 * Copyright (C) 1998,1999,2000  Genome Research Limited
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package uk.ac.sanger.artemis.io;

import java.io.Writer;
import java.io.IOException;
import java.util.Hashtable;

import uk.ac.sanger.artemis.util.LinePushBackReader;

/**
 *  This class corresponds to a group of associated lines in an EMBL entry.
 *  An example of a group of associated lines is all the lines in an entry
 *  that start with FT.
 *
 *  @author Kim Rutherford
 */

abstract class LineGroup
    extends EMBLObject
{

  /**
   *  The tag used for unidentified input.
   **/
  final static private int UNKNOWN = 0;

  /**
   *  The tag for the end of entry line: "//"
   **/
  final static int END_OF_ENTRY = 1;
  /**
   *  The text that an end of entry line starts with.
   **/
  final static String END_OF_ENTRY_STRING = "//";

  /**
   *  The tag for the start of sequence line
   **/
  final static int SEQUENCE = 2;
  /**
   *  The two letter line code of an EMBL start of sequence line.
   **/
  final static String EMBL_SEQUENCE_STRING = "SQ";

  /**
   *  The tag for an EMBL feature table line
   **/
  final static int EMBL_FEATURE = 3;
  /**
   *  The two letter line code of an EMBL feature table line.
   **/
  final static String EMBL_FEATURE_STRING = "FT";

  /**
   *  The tag for an EMBL feature header line (FH ...)
   **/
  final static int EMBL_FEATURE_HEADER = 4;
  /**
   *  The two letter line code of an EMBL feature header line.
   **/
  final static String EMBL_FEATURE_HEADER_STRING = "FH";

  /**
   *  The tag for a GENBANK feature table line
   **/
  final static int GENBANK_FEATURE = 5;

  /**
   *  This is the tag for an EMBL LineGroup that we don't have a handler for.
   *  It will be stored in an object of type EmblMisc.
   **/
  final static int EMBL_MISC = 6;

  /**
   *  This is the tag for a Genbank LineGroup that we don't have a handler
   *  for.  It will be stored in an object of type GenbankMisc.
   **/
  final static int GENBANK_MISC = 7;

  /**
   *  This is the tag for a GFF LineGroup (generally a comment line) that we
   *  don't have a handler for.  It will be stored in an object of type
   *  GFFMisc.
   **/
  final static int GFF_MISC = 8;

  /**
   *  This is the tag for a GFF format line.
   **/
  final static int GFF_FEATURE = 9;

  /**
   *  This is the tag for lines generated by MSPcrunch -d
   **/
  final static int MSPCRUNCH_FEATURE = 10;

  /**
   *  This is the tag for lines generated by blast
   **/
  final static int BLAST_FEATURE = 11;

  /**
   *  The tag for files that look like binary.
   **/
  final static int BINARY_CHARACTERS = 12;

  /**
   *  The tag for BSML XML files.
   **/
  final static int BSML_XML = 13;

  /**
   *  The tag for AGAVE XML files.
   **/
  final static int AGAVE_XML = 14;

  /**
   *  The tag for GAME XML files.  Note that getLineType () returns this tag
   *  for every line that starts with "&lt;?xml".
   **/
  final static int GAME_XML = 15;

  /**
   *  Lookup table of the GENBANK start of line keywords (LOCUS, DEFINITION,
   *  FEATURES etc.).  Each keyword is stored as both the key and the value,
   *  so a non-null result from get () means "this is a GENBANK keyword".
   **/
  private static final Hashtable<String, String> genbank_hash =
    new Hashtable<String, String> ();

  static
  {
    final String [] genbank_keywords =
    {
      "LOCUS", "DEFINITION", "ACCESSION", "NID",
      "VERSION", "KEYWORDS", "SOURCE", "REFERENCE",
      "PROJECT", "COMMENT", "FEATURES", "SEGMENT",
      "PRIMARY", "DBLINK", "DBSOURCE", "CONTIG"
    };

    for(final String keyword : genbank_keywords)
      genbank_hash.put (keyword, keyword);
  }

  /**
   *  Try to read and return a new LineGroup object from a stream.  Lines
   *  that are empty or that contain only spaces and tabs are skipped.  The
   *  first other line is classified with getLineType (), pushed back onto
   *  the reader and then consumed by the handler for that type of line.
   *  @param reader The stream to read from.
   *  @param entry Passed on to StreamSequenceFactory.makeStreamSequence ()
   *    when the next LineGroup is a sequence.
   *  @return A new LineGroup object, or null if the stream is at the end of
   *    file or the next line is the end of entry line (which is consumed).
   *  @exception IOException Thrown if exception occurs while reading.
   *  @exception ReadFormatException Thrown if the format of the input is in
   *    error: the line looks like binary data or has a type that has no
   *    handler here.
   *  @exception InvalidRelationException Thrown if this Feature cannot contain
   *    a particular Qualifier.
   **/
  protected static LineGroup readNextLineGroup (final LinePushBackReader reader, final Entry entry)
      throws IOException, InvalidRelationException 
  {
    String first_line;
    boolean is_blank;

    // skip over lines that hold nothing but spaces and tabs
    do
    {
      first_line = reader.readLine ();

      if(first_line == null)
        return null; // end of file

      int pos = 0;

      while(pos < first_line.length () &&
            (first_line.charAt (pos) == ' ' ||
             first_line.charAt (pos) == '\t'))
        ++pos;

      // the line is blank if the scan ran off the end of it
      is_blank = (pos == first_line.length ());
    } while(is_blank);

    final int type_of_line = LineGroup.getLineType (first_line);

    // the handlers below expect to read the line themselves
    reader.pushBack (first_line);

    switch (type_of_line)
    {
      case END_OF_ENTRY:
        // in this case we do want to read the line (which will be //) so that
        // the next call to readNextEntry () starts on the next entry
        reader.readLine ();
        return null;

      case SEQUENCE:
        return StreamSequenceFactory.makeStreamSequence (reader, entry);

      case EMBL_FEATURE_HEADER:
        return new FeatureHeader (reader);

      case EMBL_FEATURE:
        return EmblStreamFeature.readFromStream (reader);

      case GENBANK_FEATURE:
        return GenbankStreamFeature.readFromStream (reader);

      case GFF_FEATURE:
        return GFFStreamFeature.readFromStream (reader);

      case MSPCRUNCH_FEATURE:
        return MSPcrunchStreamFeature.readFromStream (reader);

      case BLAST_FEATURE:
        return BlastStreamFeature.readFromStream (reader);

      case EMBL_MISC:
        return new EmblMisc (reader);

      case GENBANK_MISC:
        return new GenbankMisc (reader);

      case GFF_MISC:
        return new GFFMisc (reader);

      case BINARY_CHARACTERS:
        throw new ReadFormatException ("cannot recognise format of binary file");

      default:
        throw new ReadFormatException ("reader got confused - unknown line type",
                                       reader.getLineNumber ());
    }
  }

  /**
   *  Return the line type of the line contained in the argument String.  The
   *  checks are made in this order and the first match wins:
   *  <ol>
   *  <li>a line starting with "&lt;?xml" is GAME_XML</li>
   *  <li>a line starting with "#" is GFF_MISC</li>
   *  <li>a line starting with a two character EMBL line code (letters or
   *    '/') that is followed by nothing, by padding spaces only, by three
   *    spaces or by " * " is EMBL_FEATURE, END_OF_ENTRY, SEQUENCE,
   *    EMBL_FEATURE_HEADER or (for any other code) EMBL_MISC</li>
   *  <li>a line laid out like a GENBANK feature table line is
   *    GENBANK_FEATURE</li>
   *  <li>a line starting with a GENBANK keyword is GENBANK_MISC</li>
   *  <li>then the GFF, blast, MSPcrunch and binary heuristics are tried</li>
   *  </ol>
   *  Anything else is assumed to be sequence.
   *  @param line The line to classify (must not be null).
   *  @return One of the tag constants of this class.
   */
  protected static int getLineType(final String line)
  {
    if(line.startsWith ("<?xml")) 
      return GAME_XML;

    if(line.startsWith ("#")) 
      return GFF_MISC;

    // an EMBL line code is two characters, each a letter or '/'
    final boolean has_line_code =
      line.length () >= 2 &&
      (line.charAt (0) == '/' || Character.isLetter (line.charAt (0))) &&
      (line.charAt (1) == '/' || Character.isLetter (line.charAt (1)));

    // startsWith (prefix, offset) returns false rather than throwing when the
    // line is too short, so lines of three or four characters (for example a
    // short line of raw sequence such as "ACGT") can be tested safely
    if(has_line_code &&
       (line.length () == 2 ||
        line.length () == 3 && line.endsWith (" ") ||
        line.length () == 4 && line.endsWith ("  ") ||
        line.startsWith ("   ", 2) ||
        line.startsWith (" * ", 2)))    // EMBL pre-submission line
    {

      if(line.startsWith(EMBL_FEATURE_STRING)) 
        return EMBL_FEATURE;

      if(line.startsWith(END_OF_ENTRY_STRING)) 
        return END_OF_ENTRY;

      if(line.startsWith(EMBL_SEQUENCE_STRING)) 
        return SEQUENCE;

      if(line.startsWith(EMBL_FEATURE_HEADER_STRING)) 
        return EMBL_FEATURE_HEADER;

      // this covers all the lines in the header
      return EMBL_MISC;
    }

    // a GENBANK feature line is either five spaces, a key starting in column
    // six and a space in column 21, or a non-blank continuation line that is
    // indented by at least 20 spaces
    if(line.length () > 21 &&
        ((line.startsWith ("     ") &&
          (Character.isLetter (line.charAt (5)) ||
           Character.isDigit (line.charAt (5)) ||
           line.charAt (5) == '-') &&
          line.charAt (20) == ' ') ||
         (line.startsWith ("                    ") &&
          line.trim ().length () > 0))) 
      return GENBANK_FEATURE;

    final int genbank_type = getGenbankType(line);

    if(genbank_type != UNKNOWN) 
      return GENBANK_MISC;
    
    if(isGFFLine(line))
      return GFF_FEATURE;

    if(isBlastLine(line))
      return BLAST_FEATURE;

    if(isMSPcrunchLine(line)) 
      return MSPCRUNCH_FEATURE;

    if(looksLikeBinary(line)) 
      return BINARY_CHARACTERS;

    // default is sequence
    return SEQUENCE;
  }

  /**
   *  Return true if and only if 30% or more of the characters of the argument
   *  are binary characters.  "binary" means an ISO control character other
   *  than tab, carriage return and new line, or a character with a code of
   *  128 or above.  An empty String is never binary.  This is supposed to
   *  approximate the Perl -B test.
   *  @param line The text to examine.
   *  @return true if the line looks like binary data.
   **/
  private static boolean looksLikeBinary (final String line) 
  {
    final int length = line.length ();

    if(length == 0)
      return false;

    int binary_count = 0;

    for(final char c : line.toCharArray ())
    {
      final boolean is_plain_white_space =
        c == '\t' || c == ' ' || c == '\r' || c == '\n';

      if(c >= 128 || Character.isISOControl (c) && !is_plain_white_space)
        ++binary_count;
    }

    // integer percentage, rounded down
    return binary_count * 100 / length >= 30;
  }

  /**
   *  Return true if and only if the given String appears to be a feature
   *  generated by MSPcrunch -d, which here means that, once leading and
   *  trailing white space is removed, the line starts with a digit and
   *  contains at least one space.
   *  @param line The line to examine.
   **/
  private static boolean isMSPcrunchLine (final String line) 
  {
    final String trimmed = line.trim ();

    if(trimmed.length () == 0)
      return false;

    return Character.isDigit (trimmed.charAt (0)) &&
           trimmed.indexOf (' ') >= 0;
  }

  /**
   *  Return true if and only if the given String appears to be a feature
   *  generated by blast, which here means that the line contains exactly 11
   *  tab characters.  This method is easily fooled.
   *  @param line The line to examine.
   **/
  private static boolean isBlastLine (final String line) 
  {
    final boolean has_content = line.length () > 0;

    return has_content && countChars (line, '\t') == 11;
  }

  /**
   *  Return true if and only if the given String appears to be a GFF feature,
   *  which here means that, once leading and trailing white space is removed,
   *  the line contains between 7 and 10 tab characters (inclusive).
   *  This method is easily fooled.
   *  @param line The line to examine.
   **/
  private static boolean isGFFLine (final String line) 
  {
    if(line.length () == 0)
      return false;

    final int tabs = countChars (line.trim (), '\t');

    return tabs >= 7 && tabs <= 10;
  }

  /**
   *  Return the number of occurrences of the character c in the String s.
   *  @param s The String to search.
   *  @param c The character to count.
   **/
  private static int countChars (final String s, final char c)
  {
    int total = 0;

    for(int i = 0 ; i < s.length () ; ++i)
    {
      if(s.charAt (i) == c)
        ++total;
    }

    return total;
  }

  /**
   *  Return GENBANK_MISC if the given String starts with a letter and its
   *  first word (everything before the first space, or the whole String if
   *  it contains no space) is one of the GENBANK start of line keywords,
   *  or UNKNOWN if the String doesn't start with a GENBANK keyword.
   *  @param line The line to examine.
   **/
  private static int getGenbankType (final String line) 
  {
    if(line.length () == 0 || !Character.isLetter (line.charAt (0)))
      return UNKNOWN;

    final int space_pos = line.indexOf (' ');

    final String keyword;

    if(space_pos == -1)
      keyword = line;
    else
      keyword = line.substring (0, space_pos);

    if(genbank_hash.containsKey (keyword))
      return GENBANK_MISC;

    return UNKNOWN;
  }

  /**
   *  Returns a String containing the contents of the line with the initial
   *  type string (two letters) and white space (three spaces) removed, that
   *  is everything after the first five characters.
   *  @param line The line to strip the prefix from.
   *  @return The rest of the line, or an empty String if the line has five
   *    characters or fewer.
   */
  public static String getRestOfLine (String line) 
  {
    // two characters of line code plus three of padding
    final int prefix_length = 5;

    return line.length () > prefix_length
      ? line.substring (prefix_length)
      : "";
  }

  /**
   *  Write the end of entry marker - "//" - followed by a new line.
   *  @param writer The stream to write to.
   *  @exception IOException Thrown if the write fails.
   **/
  public static void writeEndOfEMBLEntry (Writer writer) throws IOException 
  {
    final String marker_line = END_OF_ENTRY_STRING + "\n";

    writer.write (marker_line);
  }

  /**
   *  Write the line "##FASTA" followed by a new line.
   *  @param writer The stream to write to.
   *  @exception IOException Thrown if the write fails.
   **/
  public static void writeStartOfGFFEntry (Writer writer) throws IOException
  {
    final String fasta_line = "##FASTA\n";

    writer.write (fasta_line);
  }


  /**
   *  Write this object to the given stream.
   *  @param out_stream The stream to write to.
   *  @exception IOException Declared so that implementations can report a
   *    failure to write.
   **/
  public abstract void writeToStream (final Writer out_stream)
      throws IOException;

}
