/* This file is part of Malaga, a system for Natural Language Analysis.
 * Copyright (C) 1995-1999 Bjoern Beutel
 *
 * Bjoern Beutel
 * Universitaet Erlangen-Nuernberg
 * Abteilung fuer Computerlinguistik
 * Bismarckstrasse 12
 * D-91054 Erlangen
 * e-mail: malaga@linguistik.uni-erlangen.de 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* description ==============================================================*/

/* This module contains functions to compile pattern strings and to match
 * strings against the compiled patterns. */

/* includes =================================================================*/

#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "basic.h"

#undef GLOBAL
#define GLOBAL

#include "patterns.h"

/* constants ================================================================*/

/* Characters with a special meaning in a pattern string. To be used as a
 * literal char, such a character must be preceded by a backslash. */
#define SPECIAL_CHARS ".[]-^()*?+|\\" /* characters with a special meaning */

/* Capacity of the backtrack stack used by "match_pattern". A match that needs
 * more pending alternatives than this is aborted with an error. */
#define PATTERN_STACK_MAX 50 /* maximum size of pattern stack */

/* These are the instructions for matching a pattern.
 *
 * A compiled pattern is a 0-terminated sequence of chars. Each char is either
 * one of the PAT_ instruction codes below (possibly followed by operand
 * chars) or a literal char that must equal the lower-cased next char of the
 * string. The comments below use this notation:
 * C[] is the code vector.
 * PC means pattern counter (index of the char following a PAT_ instruction).
 * S[] is the string to be examined.
 * I is the index into the string.
 * CS[] is the code stack and IS[] is the string index stack.
 * SP is the stack pointer (same for CS[] and IS[]).
 * VS[] and VE[] are vectors to store the index of start and end of vars.
 * Jump operands are read as (byte_t) and count from the position of the
 * instruction code itself, i.e. from PC-1. */
enum
{
  /* code 0 is EOS */
  PAT_JUMP = 1, /* PC += (byte_t) C[PC] - 1; */
  PAT_JUMP_NOW, /* CS[SP] = PC+1; IS[SP] = I; SP++;
		 * PC += (byte_t) C[PC] - 1; */
  PAT_JUMP_LATER, /* CS[SP] = PC + (byte_t) C[PC] - 1; IS[SP] = I; SP++;
		   * PC++; */
  PAT_MATCH_ANY, /* if S[I] != EOS
		  * then I++;
		  * else fail; */
  PAT_MATCH_CLASS, /* if (S[I] != EOS
		    *     and TO_LOWER (S[I]) in {C[PC+1],..,C[PC+C[PC]]})
		    * then I++; PC += C[PC]+1;
		    * else fail; */
  PAT_MATCH_NOT_CLASS, /* if (S[I] == EOS
		        *     or TO_LOWER (S[I]) in {C[PC+1],..,C[PC+C[PC]]})
		        * then fail;
		        * else I++; PC += C[PC]+1; */
  PAT_START_VAR_0, /* VS[0] = I; */
  PAT_START_VAR_1, /* VS[1] = I; */
  PAT_START_VAR_2, /* VS[2] = I; */
  PAT_START_VAR_3, /* VS[3] = I; */
  PAT_START_VAR_4, /* VS[4] = I; */
  PAT_END_VAR_0, /* VE[0] = I; */
  PAT_END_VAR_1, /* VE[1] = I; */
  PAT_END_VAR_2, /* VE[2] = I; */
  PAT_END_VAR_3, /* VE[3] = I; */
  PAT_END_VAR_4 /* VE[4] = I; */
};

/* forwards =================================================================*/

/* Declared in advance because "compile_atom" calls it for parenthesized
 * groups, while it calls "compile_atom" itself for each alternative. */
FORWARD void local_compile_pattern (text_t text, 
				    string_t *string_ptr, 
				    bool_t *may_be_empty);

/* functions ================================================================*/

LOCAL bool_t is_pattern_char (string_t s)
/* Return TRUE iff <s> starts with a literal pattern char, that is either
 * a backslash followed by a special char or a printable char that is not
 * special. */
{
  /* A backslash is special itself, so it can only start an escape sequence. */
  if (s[0] == '\\')
    return (s[1] != EOS && strchr (SPECIAL_CHARS, s[1]) != NULL);

  return (IS_PRINT (s[0]) && strchr (SPECIAL_CHARS, s[0]) == NULL);
}

/*---------------------------------------------------------------------------*/

LOCAL char pattern_char (string_t *string_ptr)
/* Read one literal pattern char at <*string_ptr>, which is either a plain
 * printable non-special char or a special char escaped by a backslash.
 * Return its lower-case character code and advance <*string_ptr> behind it.
 * Report an error if there is no valid char. */
{
  string_t cursor = *string_ptr;
  char code;

  if (cursor[0] == '\\' 
      && cursor[1] != EOS 
      && strchr (SPECIAL_CHARS, cursor[1]) != NULL)
  {
    /* Escape sequence: the char behind the backslash is taken literally. */
    code = TO_LOWER (cursor[1]);
    cursor += 2;
  }
  else if (IS_PRINT (cursor[0]) && strchr (SPECIAL_CHARS, cursor[0]) == NULL)
  {
    /* Ordinary char. */
    code = TO_LOWER (cursor[0]);
    cursor += 1;
  }
  else if (cursor[0] == EOS)
  {
    error ("unexpected end of pattern");
  }
  else
  {
    error ("invalid char \"%c\" in pattern", cursor[0]);
  }

  *string_ptr = cursor;
  return code;
}

/*---------------------------------------------------------------------------*/

LOCAL char offset (int_t offset)
/* Return <offset> as a char that can be stored as an operand (jump distance
 * or class length) in a compiled pattern.
 * Report an error if <offset> is 0 or cannot be represented. */
{
  /* "match_pattern" reads operands back through a (byte_t) cast, and negative
   * distances are emitted for backward jumps, so the value must survive a
   * round trip through a signed char. Testing against plain "char" is wrong
   * where "char" is unsigned: every backward jump would be rejected and
   * distances above the signed range would be accepted but decoded as
   * negative.
   * 0 is rejected because a 0 char would terminate the compiled pattern. */
  if ((signed char) offset != offset || offset == 0)
    error ("pattern too complex");

  return (char) offset;
}

/*---------------------------------------------------------------------------*/

LOCAL void compile_char_class (text_t text, string_t *string_ptr)
/* Compile the character class that starts with "[" at <*string_ptr> and
 * append the resulting code to <text>: the class instruction, the number of
 * member chars and the member chars themselves.
 * Advance <*string_ptr> behind the closing "]". */
{
  string_t cursor = *string_ptr + 1; /* Skip the opening "[". */
  int_t count_index, member_count;

  /* A leading "^" negates the class. */
  if (*cursor != '^') 
    add_char_to_text (text, PAT_MATCH_CLASS);
  else
  {
    add_char_to_text (text, PAT_MATCH_NOT_CLASS);
    cursor++;
  } 

  /* The member count is only known at the end, so remember where to put it. */
  count_index = text_length (text);

  for (;;) /* Read single chars and ranges; at least one is required. */
  {
    int_t first, last, code;

    if (*cursor == EOS)
      error ("missing \"]\" in pattern");
    
    first = ORD (pattern_char (&cursor));
    last = first;
    if (*cursor == '-') 
    {
      /* This is a range; read its upper bound. */
      cursor++;
      last = ORD (pattern_char (&cursor));
      if (first > last)
	error ("invalid range \"%c-%c\" in pattern", 
	       (char) first, (char) last);
    }

    /* Store every member of the range explicitly. */
    code = first;
    while (code <= last)
    {
      add_char_to_text (text, (char) code);
      code++;
    }

    if (*cursor == ']')
      break;
  }
  
  cursor++; /* Skip the closing "]". */
  *string_ptr = cursor;

  member_count = text_length (text) - count_index;
  insert_char_in_text (text, offset (member_count), count_index); 
}

/*---------------------------------------------------------------------------*/

LOCAL void compile_atom (text_t text, 
			 string_t *string_ptr, 
			 bool_t *may_be_empty)
/* Compile the sequence of atoms at <*string_ptr>, where an atom is a
 * character class, ".", a parenthesized pattern or a literal char, each
 * optionally followed by one of the postfix operators "?", "*" or "+".
 * Append the resulting code to <text> and advance <*string_ptr> to the first
 * char that does not start an atom.
 * <*may_be_empty> is set to TRUE iff the sequence may match an empty 
 * string. */
{
  string_t cursor = *string_ptr;

  *may_be_empty = TRUE;
  for (;;)
  {
    bool_t atom_may_be_empty = FALSE; /* Only a group can change this. */
    int_t atom_start = text_length (text);
    int_t atom_length;

    /* Compile the atom itself. */
    if (*cursor == '[') 
      compile_char_class (text, &cursor);
    else if (*cursor == '.') 
    {
      cursor++;
      add_char_to_text (text, PAT_MATCH_ANY);
    } 
    else if (*cursor == '(')
    {
      cursor++;
      local_compile_pattern (text, &cursor, &atom_may_be_empty);
      if (*cursor++ != ')')
	error ("missing \")\" in pattern");
    }   
    else if (is_pattern_char (cursor))
      add_char_to_text (text, pattern_char (&cursor)); 
    else
      break; /* No more atoms. */
    
    atom_length = text_length (text) - atom_start;

    /* Wrap the atom's code according to a postfix operator, if any. */
    switch (*cursor)
    {
    case '?':
      /* Put a jump over the atom in front of it; the atom itself is kept
       * as a backtracking alternative. */
      cursor++;
      insert_char_in_text (text, PAT_JUMP_NOW, atom_start);
      insert_char_in_text (text, offset (2 + atom_length), atom_start + 1);
      break;

    case '*':
      cursor++;
      if (atom_may_be_empty)
	error ("pattern argument for \"*\" may be empty");

      /* Jump over the atom to the loop instruction behind it, which 
       * offers another pass through the atom as an alternative. */
      insert_char_in_text (text, PAT_JUMP, atom_start);
      insert_char_in_text (text, offset (2 + atom_length), atom_start + 1);
      add_char_to_text (text, PAT_JUMP_LATER);
      add_char_to_text (text, offset (- atom_length));
      break;

    case '+':
      cursor++;
      if (atom_may_be_empty)
	error ("pattern argument for \"+\" may be empty");

      /* The atom is executed once; the loop instruction behind it offers
       * another pass through the atom as an alternative. */
      add_char_to_text (text, PAT_JUMP_LATER);
      add_char_to_text (text, offset (- atom_length));
      *may_be_empty = FALSE;
      break;

    default:
      /* No postfix operator. */
      *may_be_empty &= atom_may_be_empty;
      break;
    }
  }
  
  *string_ptr = cursor;
}

/*---------------------------------------------------------------------------*/

LOCAL void local_compile_pattern (text_t text, 
				  string_t *string_ptr, 
				  bool_t *may_be_empty)
/* Compile the pattern at <*string_ptr>, which consists of one or more
 * alternatives separated by "|", and append the resulting code to <text>.
 * Advance <*string_ptr> to the first char that is not part of the pattern
 * (the caller checks whether this is ")" or the end of the string).
 * <*may_be_empty> is set to TRUE iff at least one alternative may match an
 * empty string. */
{
  string_t s = *string_ptr;
  int_t alternative_length, alternative_start;
  bool_t local_may_be_empty;

  /* Compile the first alternative. */
  alternative_start = text_length (text);
  compile_atom (text, &s, may_be_empty);
  alternative_length = text_length (text) - alternative_start;

  while (*s == '|') 
  {
    s++;

    /* Add jump from start of last alternative to start of this alternative.
     * The distance spans the two chars inserted here, the last alternative 
     * and the two chars of the PAT_JUMP inserted below. It is passed through
     * "offset" so that an alternative too long for a one-char operand is
     * reported instead of producing a wrong jump. */
    insert_char_in_text (text, PAT_JUMP_LATER, alternative_start++);
    insert_char_in_text (text, offset (alternative_length + 4), 
			 alternative_start++);

    /* Compile this alternative. */
    alternative_start = text_length (text);
    compile_atom (text, &s, &local_may_be_empty);
    alternative_length = text_length (text) - alternative_start;
    
    *may_be_empty |= local_may_be_empty;

    /* Add jump from end of last alternative to end of this alternative.
     * If another alternative follows, the next iteration inserts two more
     * chars in front of this alternative, which must be skipped as well. */
    insert_char_in_text (text, PAT_JUMP, alternative_start++);
    if (*s == '|')
      insert_char_in_text (text, offset (alternative_length + 4), 
			   alternative_start++);
    else
      insert_char_in_text (text, offset (alternative_length + 2), 
			   alternative_start++);
  }

  *string_ptr = s;
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t compile_pattern (string_t string, int_t pattern_var_no)
/* Convert <string> to a pattern to be used as input to "match_pattern".
 * If <pattern_var_no> != -1, mark the pattern so the string matching this
 * pattern will be stored in <pattern_var[pattern_var_no]>.
 * Report an error if <pattern_var_no> is neither -1 nor the number of a
 * pattern variable, or if <string> is not a valid pattern.
 * The result pattern must be freed with "free" after usage. */
{
  text_t text;
  bool_t may_be_empty;

  /* There is one PAT_START_VAR_ code per pattern variable. Any other number 
   * would silently be encoded as a different instruction or a literal char,
   * so reject it before anything is allocated. */
  if (pattern_var_no < -1 
      || pattern_var_no >= PAT_END_VAR_0 - PAT_START_VAR_0)
    error ("invalid pattern variable number");

  text = new_text ();

  if (pattern_var_no != -1)
    add_char_to_text (text, PAT_START_VAR_0 + pattern_var_no);

  /* The whole string must be consumed; a remaining char is one that can
   * neither start an atom nor continue the pattern, e.g. a stray ")". */
  local_compile_pattern (text, &string, &may_be_empty);
  if (*string != EOS)
    error ("illegal char \"%c\" in pattern", *string);

  if (pattern_var_no != -1)
    add_char_to_text (text, PAT_END_VAR_0 + pattern_var_no);

  return text_to_string (&text);
}

/*---------------------------------------------------------------------------*/

GLOBAL bool_t match_pattern (string_t string, string_t pattern)
/* Test whether <string> matches <pattern> (a string of chars compiled with
 * "compile_pattern"). The whole of <string> must be matched; its chars are 
 * converted with TO_LOWER before they are compared.
 * On success, return TRUE and store each substring whose start and end were 
 * both marked in the corresponding entry of <pattern_var>, freeing the
 * previous content of that entry. Otherwise return FALSE.
 * The substrings can be freed with "free_mem". */
{
  /* Pending alternatives: where to resume in string and pattern. */
  struct {string_t string, pattern;} stack[PATTERN_STACK_MAX];
  /* Marked start and end of each pattern variable, NULL if not marked. */
  struct {string_t start; string_t end;} var[PATTERN_VAR_MAX];
  int_t sp = 0;
  int_t i;

  for (i = 0; i < PATTERN_VAR_MAX; i++)
  {
    var[i].start = NULL;
    var[i].end = NULL;
  }

  for (;;)
  {
    char code = *pattern++; /* <pattern> now points to the first operand. */
    bool_t failed = FALSE;
    
    switch (code) 
    {
    case EOS:
      /* End of pattern: succeed only if the string is used up as well. */
      if (*string != EOS)
      {
	failed = TRUE;
	break;
      }

      for (i = 0; i < PATTERN_VAR_MAX; i++)
      {
	if (var[i].start == NULL || var[i].end == NULL)
	  continue;

	free_mem (pattern_var + i);
	pattern_var[i] = new_string (var[i].start, var[i].end);
      }
      return TRUE;

    case PAT_JUMP:
      /* The distance counts from the instruction code, hence the "- 1". */
      pattern += (byte_t) *pattern - 1;
      break;
      
    case PAT_JUMP_NOW:
    case PAT_JUMP_LATER:
      if (sp == PATTERN_STACK_MAX)
	error ("match pattern is too complex");
      
      stack[sp].string = string;
      if (code == PAT_JUMP_NOW)
      {
	/* Take the jump; keep the code behind the operand for later. */
	stack[sp].pattern = pattern + 1;
	pattern += (byte_t) *pattern - 1;
      }
      else
      {
	/* Go on behind the operand; keep the jump target for later. */
	stack[sp].pattern = pattern + (byte_t) *pattern - 1;
	pattern++;
      }
      sp++;
      break;
      
    case PAT_MATCH_ANY:
      if (*string == EOS)
	failed = TRUE;
      string++;
      break;
      
    case PAT_MATCH_CLASS:
    case PAT_MATCH_NOT_CLASS:
      if (*string == EOS)
	failed = TRUE;
      else 
      {
	string_t member = pattern + 1; /* The first class member. */
	
	/* The operand is the number of members; skip to the code behind. */
	pattern += (byte_t) *pattern + 1;

	/* Stop at the member that equals the current char, if there is one. */
	while (member < pattern && TO_LOWER (*string) != *member)
	  member++;
	string++;
	
	if (code == PAT_MATCH_CLASS)
	{
	  if (member >= pattern) /* The char is not in the class. */
	    failed = TRUE;
	}
	else
	{
	  if (member < pattern) /* The char is in the excluded class. */
	    failed = TRUE;
	}
      }
      break;

    case PAT_START_VAR_0:
    case PAT_START_VAR_1:
    case PAT_START_VAR_2:
    case PAT_START_VAR_3:
    case PAT_START_VAR_4:
      var[code - PAT_START_VAR_0].start = string;
      break;
      
    case PAT_END_VAR_0:
    case PAT_END_VAR_1:
    case PAT_END_VAR_2:
    case PAT_END_VAR_3:
    case PAT_END_VAR_4:
      var[code - PAT_END_VAR_0].end = string;
      break;
      
    default:
      /* A literal char. */
      if (code != TO_LOWER (*string))
	failed = TRUE;
      string++;
      break;
    }
    
    if (failed)
    {
      /* This path was not successful; give up if there is no other one. */
      if (sp <= 0)
	return FALSE;

      /* Resume the alternative that was saved last. */
      sp--;
      string = stack[sp].string;
      pattern = stack[sp].pattern;
    }
  }
}

/* end of file ==============================================================*/
