/* File "scanner.c":
 * Defines lexical analysis of malaga source files. */

/* This file is part of Malaga, a system for Left Associative Grammars.
 * Copyright (C) 1995-1998 Bjoern Beutel
 *
 * Bjoern Beutel
 * Universitaet Erlangen-Nuernberg
 * Abteilung fuer Computerlinguistik
 * Bismarckstrasse 12
 * D-91054 Erlangen
 * e-mail: malaga@linguistik.uni-erlangen.de 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* includes =================================================================*/

#include <string.h>
#include <stdio.h>
#include "basic.h"
#include "files.h"

#undef GLOBAL
#define GLOBAL

#include "scanner.h"

/* constants ================================================================*/

/* Capacity of the <sources> array. "include_file" reports an error if
 * opening another file would exceed this nesting depth. */
#define INCLUDE_LEVEL_MAX 10 /* maximum number of nested includes */

LOCAL struct { string_t name; short_t code; } keywords[NUMBER_OF_KEYWORDS] = 
/* List of all keywords and their token codes.
 * This list must be maintained in alphabetical order, because "keyword"
 * looks names up by binary search, comparing with "strcmp_no_case".
 * "token_as_text" scans the same table linearly to map a token code back
 * to its keyword.
 * NUMBER_OF_KEYWORDS is not defined in this file; it must match the number
 * of entries below. */
{
  { "accept", TOK_ACCEPT },
  { "allo_rule", TOK_ALLO_RULE },
  { "and", TOK_AND },
  { "assert", TOK_ASSERT },
  { "choose", TOK_CHOOSE },
  { "combi_rule", TOK_COMBI_RULE },
  { "define", TOK_DEFINE },
  { "else", TOK_ELSE },
  { "elseif", TOK_ELSEIF },
  { "end", TOK_END },
  { "end_rule", TOK_END_RULE },
  { "error", TOK_ERROR },
  { "fail", TOK_FAIL },
  { "filter_rule", TOK_FILTER_RULE },
  { "foreach", TOK_FOREACH },
  { "greater", TOK_GREATER },
  { "greater_equal", TOK_GREATER_EQUAL },
  { "if", TOK_IF },
  { "in", TOK_IN },
  { "include", TOK_INCLUDE },
  { "initial", TOK_INITIAL },
  { "input_rule", TOK_INPUT_RULE },
  { "less", TOK_LESS },
  { "less_equal", TOK_LESS_EQUAL },
  { "matches", TOK_MATCHES },
  { "not", TOK_NOT },
  { "or", TOK_OR },
  { "parallel", TOK_PARALLEL },
  { "pruning_rule", TOK_PRUNING_RULE },
  { "require", TOK_REQUIRE },
  { "result", TOK_RESULT },
  { "return", TOK_RETURN },
  { "robust_rule", TOK_ROBUST_RULE },
  { "rules", TOK_RULES },
  { "subrule", TOK_SUBRULE },
  { "then", TOK_THEN }
};

/* types ====================================================================*/

/* Describes one file that is being read at some include level.
 * The fields are filled in by "include_file". */
typedef struct /* a source stream for lexical analysis */
{
  FILE *stream;           /* the input stream for this include level */
  string_t file_name;     /* the name of the input file; the pointer given
                           * to "include_file" is stored, not a copy */
  long_t file_name_index; /* index of input file name in a string pool */
  long_t column;          /* column that has been read: starts at 0, is
                           * reset to 0 by a newline and is advanced to
                           * the next multiple of 8 by a tab */
  long_t line_number;     /* number of the line that has been read: starts
                           * at 1 and is incremented when a newline is
                           * read */
} source_t;

/* variables ================================================================*/

LOCAL source_t sources[INCLUDE_LEVEL_MAX];
/* For each include level, we define a source stream description.
 * Only the first <include_level> entries are in use. */

LOCAL short_t include_level = 0;
/* Current include level, i.e. the number of entries of <sources> that
 * describe a file opened by "include_file" and not yet closed. */

LOCAL source_t *source = NULL;
/* Points to <sources>[<include_level>-1], the innermost included file.
 * It is NULL if no file is included. */

LOCAL string_t scanner_input = NULL;
/* If no file is included, the scanner reads its input from <scanner_input>.
 * The pointer is advanced for each char that is read and is set to NULL
 * when the end of the string has been reached. */

LOCAL short_t next_char;
/* The next char to be read (one char of lookahead), or EOF at end of
 * input. */

/* functions ================================================================*/

LOCAL void read_next_char (void)
/* Read the next char from input into <next_char>.
 * If the end of input is reached, <next_char> is set to EOF.
 * If a file is included, read from its stream and update the column and
 * line information; a read error on the stream is reported via "error".
 * If no file is included, read input from <scanner_input>; when the string
 * is exhausted, <scanner_input> is reset to NULL. */
{ 
  if (source != NULL) 
  {
    next_char = getc (source->stream);
    
    if (next_char == EOF && ferror (source->stream))
      error ("can't read from \"%s\"", source->file_name);
    
    if (next_char == '\t')
      source->column = (source->column + 8) & ~7; /* next tab stop */
    else if (next_char == '\n') 
    {
      source->column = 0;
      source->line_number++;
    }
    else
      source->column++;
  }
  else if (scanner_input != NULL && *scanner_input != EOS)
  {
    /* Convert via "unsigned char", as "getc" does for file input.
     * Otherwise, where plain "char" is signed, chars above 127 would
     * become negative and the char 0xFF would be mistaken for EOF. */
    next_char = (unsigned char) *scanner_input++;
  }
  else 
  {
    scanner_input = NULL;
    next_char = EOF;
  }
}

/*---------------------------------------------------------------------------*/

LOCAL void read_next_char_again (void)
/* Like "read_next_char", but don't update column information.
 * This is used to read a char a second time after it has been pushed back
 * (see "include_file" and "end_include"). */
{ 
  if (source != NULL) 
  {
    next_char = getc (source->stream);
    
    if (next_char == EOF && ferror (source->stream)) 
      error ("can't read from \"%s\"", source->file_name);
  }
  else if (scanner_input != NULL && *scanner_input != EOS)
  {
    /* Convert via "unsigned char", as "getc" does for file input, so
     * that chars above 127 stay positive and 0xFF is distinct from EOF. */
    next_char = (unsigned char) *scanner_input++;
  }
  else 
  {
    scanner_input = NULL;
    next_char = EOF;
  }
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t current_file_name (void)
/* Return the name of the file that is currently being read.
 * Return NULL if no file is included. */
{
  return (source != NULL) ? source->file_name : NULL;
}

/*---------------------------------------------------------------------------*/

GLOBAL long_t current_file_name_index (void)
/* Return the string pool index of the name of the file that is currently
 * being read. Return -1 if no file is included. */
{
  return (source != NULL) ? source->file_name_index : -1;
}

/*---------------------------------------------------------------------------*/

GLOBAL long_t current_line_number (void)
/* Return the number of the line where the last char has been read.
 * Return -1 if no file is included. */
{
  return (source != NULL) ? source->line_number : -1;
}

/*---------------------------------------------------------------------------*/

GLOBAL long_t current_column (void)
/* Return the column where the last char has been read.
 * Return -1 if no file is included. */
{
  if (source == NULL)
    return -1;

  /* Let columns start with 0, but never report a negative column when
   * nothing has been read on the current line yet. */
  return (source->column == 0) ? 0 : source->column - 1;
}

/*---------------------------------------------------------------------------*/

GLOBAL void set_scanner_input (string_t input)
/* Let the scanner use <input> as scanner input.
 * <input> must remain valid until the scanner has done its work, since
 * only the pointer is stored and chars are fetched from it on demand.
 * The first char and the first token are read immediately.
 * Note that <scanner_input> is only read from while no file is included
 * (see "read_next_char"). */
{
  scanner_input = input;
  read_next_char ();  /* Fill the lookahead char <next_char>. */
  read_next_token (); /* Fill the lookahead token <next_token>. */
}

/*---------------------------------------------------------------------------*/

GLOBAL void include_file (string_t file_name, long_t file_name_index)
/* Open a new level of inclusion and read tokens from <file_name>.
 * <file_name_index> is index of the current file name in the string pool.
 * The first token of the new file is read before returning. */
{
  source_t *new_source;
  FILE *new_stream;

  if (include_level >= INCLUDE_LEVEL_MAX)
    error ("too many nested includes");

  /* The lookahead char of the old source has already been consumed.
   * Push it back, so it can be read again when the include has ended. */
  if (source != NULL)
    ungetc (next_char, source->stream);
  else if (scanner_input != NULL)
    scanner_input--;

  /* Open the file first; the scanner state is only changed afterwards. */
  new_stream = fopen_save (file_name, "r");

  /* Occupy the next free source description and make it the current one. */
  new_source = &sources[include_level];
  include_level++;
  new_source->stream = new_stream;
  new_source->file_name = file_name;
  new_source->file_name_index = file_name_index;
  new_source->line_number = 1;
  new_source->column = 0;
  source = new_source;

  /* Fill the lookahead char and the lookahead token from the new file. */
  read_next_char ();
  read_next_token ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void end_include (void)
/* Stop reading from current source stream and read from former file
 * (or from <scanner_input> if this was the outermost included file). */
{
  DB_ASSERT (include_level > 0);

  /* Close the file that we have included. */
  fclose_save (source->stream, source->file_name);

  /* Step back to the enclosing source, if there is one. */
  include_level--;
  source = (include_level > 0) ? &sources[include_level - 1] : NULL;

  /* Nothing is left to read from. */
  if (source == NULL && scanner_input == NULL)
    return;

  /* Re-read the char that "include_file" has pushed back. Its column has
   * already been counted, so don't count it again. */
  read_next_char_again ();
  read_next_token ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void stop_scanner (void)
/* Stop the scanner in case of an emergency:
 * close all included files and forget any scanner input. */
{
  long_t level;

  /* Close the streams of all include levels that are still open.
   * Results of "fclose" are ignored here. */
  for (level = 0; level < include_level; level++)
    fclose (sources[level].stream);

  include_level = 0;
  source = NULL;
  scanner_input = NULL;
}

/*---------------------------------------------------------------------------*/

LOCAL void read_name (void)
/* Read rule name, variable, or keyword into <token_name>.
 * A name is a sequence of letters, digits, "_", "&" and "|" that starts
 * at <next_char>. On return, <next_char> is the first char behind the name.
 * An error is reported if the name is empty or if it does not fit into
 * <token_name> (TOKEN_NAME_MAX chars, including the terminating EOS). */
{
  short_t length = 0; /* number of chars stored in <token_name> so far */

  while (next_char != EOF &&
	 (next_char == '_' || next_char == '&' || next_char == '|' 
	  || IS_DIGIT (next_char) || IS_LETTER (next_char)))
  {
    /* Leave room for the terminating EOS. */
    if (length >= TOKEN_NAME_MAX - 1)
      error ("identifier too long");
    
    token_name[length++] = next_char;
    read_next_char ();
  }

  token_name[length] = EOS;

  /* <length> is an integer, so it is compared with 0 and not with the
   * pointer constant NULL. */
  if (length == 0)
    error ("illegal character in name");
}

/*---------------------------------------------------------------------------*/

LOCAL short_t keyword (string_t name)
/* Look up <name> in the keyword table and return its token value.
 * If <name> is no keyword, return TOK_IDENT.
 * The comparison is done by "strcmp_no_case". */
{
  short_t first = 0;
  short_t last = NUMBER_OF_KEYWORDS - 1;

  /* Binary search: if <name> is a keyword at all, it is one of
   * keywords[first..last]. The range shrinks in every step. */
  while (first <= last) 
  {
    short_t probe = (first + last) / 2;
    byte_t order = strcmp_no_case (name, keywords[probe].name);
    
    if (order == 0)
      return keywords[probe].code;

    if (order < 0)
      last = probe - 1;
    else
      first = probe + 1;
  }
  
  /* The range is empty, so <name> is not in the table. */
  return TOK_IDENT;
}

/*---------------------------------------------------------------------------*/

LOCAL void read_number (void)
/* Read a floating point number of the form
 * <digits> [ "." <digits> ] [ ("E" | "e") ["+" | "-"] <digits> ]
 * that starts at <next_char>. Save its value in <token_number>.
 * A leading sign is not read here. */
{
  /* Read the integer part. */
  for (token_number = 0.0; IS_DIGIT (next_char); read_next_char ())
    token_number = 10.0 * token_number + (next_char - '0');

  /* Read the fraction, if there is one. */
  if (next_char == '.') 
  {
    double weight; /* value of one unit at the current decimal place */
    
    read_next_char ();
    if (! IS_DIGIT (next_char)) 
      error ("missing digit after \".\"");
    
    for (weight = 1.0; IS_DIGIT (next_char); read_next_char ())
    {
      weight *= 0.1;
      token_number += weight * (next_char - '0');
    }
  }
  
  /* Read the exponent, if there is one. */
  if (next_char == 'e' || next_char == 'E')
  {
    double base;   /* 10.0 for a positive, 0.1 for a negative exponent */
    short_t power; /* absolute value of the exponent */
    
    read_next_char ();
    if (next_char == '-')
    {
      base = 0.1;
      read_next_char ();
    }
    else
    {
      base = 10.0;
      if (next_char == '+')
        read_next_char ();
    }
    
    if (! IS_DIGIT (next_char)) 
      error ("missing exponent");
    
    /* Read exponent number. */
    for (power = 0; IS_DIGIT (next_char); read_next_char ())
    {
      power = 10 * power + (next_char - '0');
      if (power > 300)
        error ("exponent too big");
    }
    
    /* Multiply <token_number> by <base> ^ <power>, using repeated
     * squaring: <base> runs through base^1, base^2, base^4, ... and is
     * multiplied in for every bit that is set in <power>. */
    for (; power > 0; power = power / 2)
    {
      if (power & 1)
        token_number *= base;
      base = base * base;
    }
  }
}

/*---------------------------------------------------------------------------*/

GLOBAL void read_next_token (void)
/* Read the next token from current source into <next_token>.
 * If end of input stream is reached, <next_token> is set to EOF.
 * The scanner keeps one char of lookahead: on entry, <next_char> is the
 * first char that has not been scanned yet, and on return it is the first
 * char behind the token that has been read.
 * Depending on the token, additional results are stored:
 * - TOK_STRING: the string contents (without the enclosing quotes, but
 *   still with its backslashes) in <token_name>.
 * - TOK_NUMBER: the value in <token_number>.
 * - TOK_VARIABLE, TOK_CONSTANT, TOK_IDENT and keywords: the name (without
 *   a leading "$" or "@") in <token_name>.
 * Any char that starts no other token is a token of its own, with the
 * char code as token code. */
{
  /* Read chars until a token has been recognised.
   * Only whitespace and comments loop; everything else returns. */
  while (TRUE) 
  {
    switch (next_char) 
    {
    case EOF:
      next_token = EOF;
      return;
      
    case ' ':
    case '\t':
    case '\n': /* Read over whitespace. */
      read_next_char ();
      break;
      
    case '#': /* Read over a comment that extends to the end of line. */
      do 
      {
	read_next_char ();
      } while (next_char != '\n' && next_char != EOF);
      break;
      
    case '\"': /* Read a string. */
    {
      short_t index = 0; /* number of chars stored in <token_name> */
      
      read_next_char (); /* overread beginning '"' */
      while (next_char != '\"') 
      {
	if (next_char == '\\') 
	{
	  /* Do NOT delete the backslash, it's needed for patterns.
	   * It is stored, and the char behind it is stored below without
	   * being compared to '"', so an escaped '"' does not terminate
	   * the string. */
	  if (index >= TOKEN_NAME_MAX - 1)
	    error ("string too long");
	  
	  token_name[index++] = next_char;
	  read_next_char ();
	}

	/* A string must be terminated on the line where it began. */
	if (next_char == '\n' || next_char == EOF)
	  error ("unterminated string at end of line");
	
	/* Leave room for the terminating EOS. */
	if (index >= TOKEN_NAME_MAX - 1)
	  error ("string too long");
	
	token_name[index++] = next_char;
	read_next_char ();
      }
      
      read_next_char (); /* Read over terminating '"'. */
      token_name[index] = EOS;
      next_token = TOK_STRING;
      return;
    }

    case ':': /* Read a ":", ":=", ":=+", ":=-", ":=*", ":=/". */
      read_next_char ();
      if (next_char == '=')
      {
	read_next_char ();
	if (next_char == '+')
	{
	  next_token = TOK_ASSIGN_PLUS;
	  read_next_char ();
	}
	else if (next_char == '-')
	{
	  next_token = TOK_ASSIGN_MINUS;
	  read_next_char ();
	}
	else if (next_char == '*')
	{
	  next_token = TOK_ASSIGN_ASTERISK;
	  read_next_char ();
	}
	else if (next_char == '/')
	{
	  next_token = TOK_ASSIGN_SLASH;
	  read_next_char ();
	}
	else /* The char behind ":=" belongs to the following token. */
	  next_token = TOK_ASSIGN;
      }
      else
	next_token = ':';
      return;
      
    case '/': /* Read a "/", a "/=" or a "/~". */
      read_next_char ();
      if (next_char == '=') 
      {
	next_token = TOK_NOT_EQUAL;
	read_next_char ();
      }
      else if (next_char == '~') 
      {
	next_token = TOK_NOT_CONGRUENT;
	read_next_char ();
      }
      else
	next_token = '/';
      return;
      
    case '-': /* Read a "-" or a negative number. */
      read_next_char ();
      if (! IS_DIGIT (next_char))
	next_token = '-';
      else
      {
	/* A "-" that is directly followed by a digit is always taken as
	 * the sign of a number. */
	read_number ();
	token_number = -token_number;
	next_token = TOK_NUMBER;
      }
      return;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      /* Read a number. */
      read_number ();
      next_token = TOK_NUMBER;
      return;
      
    case '$': /* Read a variable; its name follows the "$". */
      read_next_char ();
      read_name ();
      next_token = TOK_VARIABLE;
      return;
      
    case '@': /* Read a constant; its name follows the "@". */
      read_next_char ();
      read_name ();
      next_token = TOK_CONSTANT;
      return;

    default: 
      if (IS_LETTER (next_char) || next_char == '_')
      {
	/* Read a name; it's either a keyword or an identifier. */
	read_name ();
	next_token = keyword (token_name);
	return;
      } 
      else 
      { 
	/* Any other char is a token of its own. */
	next_token = next_char;
	read_next_char ();
	return;
      }
    }
  }
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t token_as_text (short_t token)
/* Return <token> as a string readable for humans.
 * Keywords and single-char tokens are built in a static buffer, so the
 * string is only valid until this function is called again. */
{
  static char text[20];
  string_t text_end = text + sizeof (text);
  string_t cursor;
  char single_char[2];
  short_t k;

  /* A keyword is shown as its name, enclosed in quotes. */
  for (k = 0; k < NUMBER_OF_KEYWORDS; k++) 
  {
    if (keywords[k].code != token)
      continue;

    cursor = copy_string (text, "\"", text_end);
    cursor = copy_string (cursor, keywords[k].name, text_end);
    copy_string (cursor, "\"", text_end);
    return text;
  }
  
  /* The remaining named tokens have fixed descriptions. */
  switch (token) 
  {
  case EOF:
    return "end of file"; 
  case TOK_STRING:
    return "string";
  case TOK_IDENT:
    return "identifier";
  case TOK_VARIABLE:
    return "variable";
  case TOK_CONSTANT:
    return "constant";
  case TOK_NUMBER:
    return "number";
  case TOK_ASSIGN:
    return "\":=\"";
  case TOK_ASSIGN_PLUS:
    return "\":=+\"";
  case TOK_ASSIGN_MINUS:
    return "\":=-\"";
  case TOK_ASSIGN_ASTERISK:
    return "\":=*\"";
  case TOK_ASSIGN_SLASH:
    return "\":=/\"";
  case TOK_NOT_EQUAL:
    return "\"/=\"";
  case TOK_NOT_CONGRUENT:
    return "\"/~\"";
  default:
    break;
  }

  /* Any other token is a single char; show it in quotes and in readable
   * form. */
  single_char[0] = token;
  single_char[1] = EOS;

  cursor = copy_string (text, "\"", text_end);
  cursor = copy_string_readable (cursor, single_char, text_end, NULL);
  copy_string (cursor, "\"", text_end);
  return text;
}

/*---------------------------------------------------------------------------*/

GLOBAL void test_token (short_t token)
/* Test if <token> is the next token. If it's not, report an error that
 * names the expected and the actual token. */
{
  char found[20];

  if (next_token == token)
    return;

  /* "token_as_text" may return a static buffer that is overwritten by the
   * next call, so keep a copy of the first result. */
  copy_string (found, token_as_text (next_token), found + sizeof (found));
  error ("%s expected, not %s", token_as_text (token), found);
}

/*---------------------------------------------------------------------------*/

GLOBAL void parse_token (short_t token)
/* Test if <token> is the next token and read next token.
 * If the next token is not <token>, "test_token" reports an error. */
{
  test_token (token);
  read_next_token ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void delete_escapes (string_t string)
/* Delete any "\" in <string> (unless it is preceded by a "\").
 * The char behind a "\" is always kept, so "\\" becomes "\".
 * A single "\" at the very end of <string> is deleted.
 * <string> is modified in place. */
{
  string_t from = string; /* next char to be examined */
  string_t to = string;   /* where the next char that is kept is stored */

  for (;;)
  {
    if (*from == '\\')
      from++; /* Skip the escape char; the char behind it is kept. */

    if (*from == EOS)
      break;

    *to++ = *from++;
  }
  *to = EOS;
}

/*---------------------------------------------------------------------------*/
