/* ,file-id archive://[lord]/435/rx/node.c/1998-05-18
 */

/*	Copyright (C) 1997 Tom Lord
 * 
 * This program is provided to you under the terms of the Liberty Software
 * License.  You are NOT permitted to redistribute, modify, or use it
 * except in very specific ways described by that license.
 *
 * This software comes with NO WARRANTY.
 * 
 * You should have received a copy of the Liberty Software License
 * along with this software; see the file =LICENSE.  If not, write to
 * the Tom Lord, 1810 Francisco St. #2, Berkeley CA, 94703, USA.  
 */






#include "vu/bitset.h"
#include "vu/dstr.h"
#include "vu/str.h"
#include "vu/xmalloc.h"
#include "rexp.h"


/****************************************************************
 * Allocation and initialization.
 */

/* Allocate a new expression node of kind TYPE.
 *
 * Storage is obtained from xmalloc and cleared with memset0 before
 * the type is recorded.  The node is returned with its reference
 * count ("refs") set to 1.
 */
struct rx_exp_node *
rx_exp_node (int type)
{
  struct rx_exp_node *fresh;

  fresh = (struct rx_exp_node *) xmalloc (sizeof (struct rx_exp_node));
  memset0 ((char *)fresh, sizeof (struct rx_exp_node));
  fresh->refs = 1;
  fresh->type = type;
  return fresh;
}

/* Build a node of kind TYPE that carries a character set.
 *
 * The node's "cset" field is set to the result of bitset_dup (SIZE, B)
 * and SIZE is recorded in its "cset_size" field.
 */
struct rx_exp_node *
rx_mk_r_cset (int type, int size, bitset b)
{
  struct rx_exp_node * set_node;

  set_node = rx_exp_node (type);
  set_node->cset_size = size;
  set_node->cset = bitset_dup (size, b);
  return set_node;
}

/* Build a node of kind TYPE with A as its left child and B as its
 * right child.
 *
 * The child pointers are stored as given; this function does not
 * change the reference counts of A or B.
 */
struct rx_exp_node *
rx_mk_r_binop (int type,
               struct rx_exp_node * a,
               struct rx_exp_node * b)
{
  struct rx_exp_node * parent;

  parent = rx_exp_node (type);
  parent->right = b;
  parent->left = a;
  return parent;
}

struct rx_exp_node *
rx_mk_r_monop (int type, struct rx_exp_node * a)
{
  return rx_mk_r_binop (type, a, 0);
}

/* Build a node of kind TYPE whose string field ("cstr") is
 * initialized, via init_dstr, from the single character C.
 */
struct rx_exp_node *
rx_mk_r_str_c (int type, char c)
{
  struct rx_exp_node *str_node = rx_exp_node (type);

  init_dstr (&str_node->cstr, &c, 1);
  return str_node;
}

/* Build a node of kind TYPE whose string field ("cstr") is
 * initialized, via init_dstr, from the LEN characters starting at S.
 */
struct rx_exp_node *
rx_mk_r_str (int type, char * s, int len)
{
  struct rx_exp_node *str_node = rx_exp_node (type);

  init_dstr (&str_node->cstr, s, len);
  return str_node;
}

/* Build a node of kind TYPE whose "intval" field is INTVAL. */
struct rx_exp_node *
rx_mk_r_int (int type, int intval)
{
  struct rx_exp_node * int_node = rx_exp_node (type);

  int_node->intval = intval;
  return int_node;
}

/* Build a node of kind TYPE carrying two integers: INTVAL goes in
 * the "intval" field and INTVAL2 in the "intval2" field.
 */
struct rx_exp_node *
rx_mk_r_int2 (int type, int intval, int intval2)
{
  struct rx_exp_node * pair_node;

  /* rx_mk_r_int allocates the node and fills in the first value. */
  pair_node = rx_mk_r_int (type, intval);
  pair_node->intval2 = intval2;
  return pair_node;
}


/****************************************************************
 * Reference counting and copying.
 */

/* Add one reference to NODE.  A null NODE is ignored. */
void
rx_save_rexp (struct rx_exp_node * node)
{
  if (node == 0)
    return;

  node->refs += 1;
}


/* Drop one reference to NODE.  A null NODE is ignored.
 *
 * When the reference count reaches zero the node is destroyed: its
 * character set and string storage (if any) are released, one
 * reference is dropped from each of the "left", "right" and
 * "simplified" nodes, and finally the node itself is freed.
 */
void
rx_free_rexp (struct rx_exp_node * node)
{
  if (node == 0)
    return;

  --node->refs;
  if (node->refs != 0)
    return;			/* Still referenced elsewhere. */

  if (node->cset)
    xfree (node->cset);

  if (node->cstr.chr)
    free_dstr_static (&(node->cstr));

  rx_free_rexp (node->left);
  rx_free_rexp (node->right);
  rx_free_rexp (node->simplified);

  xfree ((char *)node);
}

/* Return a deep copy of the tree rooted at NODE, or null if NODE is
 * null.
 *
 * Every node reachable through "left" and "right" is copied
 * recursively.  Character sets are duplicated with
 * bitset_dup (CSET_SIZE, ...) and strings with init_dstr.  The
 * "intval", "intval2", "len" and "observed" fields are carried over;
 * the "simplified" field is not copied.
 */
struct rx_exp_node * 
rx_copy_rexp (int cset_size, struct rx_exp_node *node)
{
  struct rx_exp_node *twin;

  if (node == 0)
    return 0;

  twin = rx_exp_node (node->type);

  if (node->cset)
    {
      twin->cset_size = node->cset_size;
      twin->cset = bitset_dup (cset_size, node->cset);
    }

  if (node->cstr.chr)
    init_dstr (&twin->cstr, node->cstr.chr, node->cstr.len);

  twin->intval = node->intval;
  twin->intval2 = node->intval2;
  twin->len = node->len;
  twin->observed = node->observed;

  twin->left = rx_copy_rexp (cset_size, node->left);
  twin->right = rx_copy_rexp (cset_size, node->right);
  return twin;
}

/* Return a copy of NODE that shares NODE's children, or null if NODE
 * is null.
 *
 * The node's own data is duplicated: the character set with
 * bitset_dup (CSET_SIZE, ...) together with its recorded "cset_size",
 * the string with init_dstr, and the "intval", "intval2", "len" and
 * "observed" fields.  The "left" and "right" subtrees are not copied;
 * the new node points at the same children and adds one reference to
 * each.  The "simplified" field is not copied.
 */
struct rx_exp_node * 
rx_shallow_copy_rexp (int cset_size, struct rx_exp_node *node)
{
  if (!node)
    return 0;
  else
    {
      struct rx_exp_node *n;
      n = rx_exp_node (node->type);
      if (node->cset)
        {
          n->cset = bitset_dup (cset_size, node->cset);
          /* Keep the size alongside the set, as rx_copy_rexp does;
           * rx_exp_equal and rx_fill_in_fastmap both read cset_size.
           */
          n->cset_size = node->cset_size;
        }

      if (node->cstr.chr)
        init_dstr (&(n->cstr),
                   node->cstr.chr,
                   node->cstr.len);

      n->intval = node->intval;
      n->intval2 = node->intval2;
      n->left = node->left;
      rx_save_rexp (node->left);
      n->right = node->right;
      rx_save_rexp (node->right);
      n->len = node->len;
      n->observed = node->observed;
      return n;
    }
}


/****************************************************************
 * Hashing and Equality
 *
 */

/* Structural equality of two expression trees.
 *
 * Returns 1 if A and B are the same pointer, or if they have the same
 * type and their type-specific contents are equal; returns 0
 * otherwise (including when exactly one of them is null, or when the
 * type is not one of those handled below).
 *
 * What is compared, by type:
 *
 *   r_cset                  cset_size and the set bits
 *   r_string                length and every byte of the string
 *   r_cut, r_context        intval
 *   r_concat, r_alternate   left and right subtrees
 *   r_star                  left subtree
 *   r_interval              intval, intval2 and left subtree
 *   r_parens                intval and left subtree
 */
int
rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b)
{
  int ret;

  if (a == b)
    return 1;

  if ((a == 0) || (b == 0))
    return 0;

  if (a->type != b->type)
    return 0;

  switch (a->type)
    {
    case r_cset:
      ret = (   (a->cset_size == b->cset_size)
             && bitset_is_equal (a->cset_size,
                                 a->cset,
                                 b->cset));
      break;

    case r_string:
      /* The strings are counted, not NUL-terminated, so compare every
       * byte.  (strncmp would stop at an embedded NUL and report
       * strings that differ after it as equal.)
       */
      ret = (a->cstr.len == b->cstr.len);
      if (ret)
        {
          int x;
          for (x = 0; x < a->cstr.len; ++x)
            if (a->cstr.chr[x] != b->cstr.chr[x])
              {
                ret = 0;
                break;
              }
        }
      break;

    case r_cut:
      ret = (a->intval == b->intval);
      break;

    case r_concat:
    case r_alternate:
      ret = (   rx_exp_equal (a->left, b->left)
             && rx_exp_equal (a->right, b->right));
      break;
    case r_star:
      ret = rx_exp_equal (a->left, b->left);
      break;
    case r_interval:
      ret = (   (a->intval == b->intval)
             && (a->intval2 == b->intval2)
             && rx_exp_equal (a->left, b->left));
      break;
    case r_parens:
      ret = (   (a->intval == b->intval)
             && rx_exp_equal (a->left, b->left));
      break;

    case r_context:
      ret = (a->intval == b->intval);
      break;
    default:
      return 0;
    }
  return ret;
}

/* Fold the tree rooted at NODE into the running hash value SEED and
 * return the result.  A null NODE leaves SEED unchanged.
 *
 * Each node contributes its type, the strnhash of its string, and its
 * two integer values; the seed is rotated left by 11 bits and
 * combined with that contribution before the left and then the right
 * subtree are folded in.  The character set is not hashed.
 */
static unsigned long
exp_hash (struct rx_exp_node * node, unsigned long seed)
{
  unsigned long mixed;
  unsigned long rotated;

  if (node == 0)
    return seed;

  /* This is just made up and should be checked out. */

  mixed = node->type ^ strnhash (node->cstr.chr, node->cstr.len);
  mixed ^= ((seed << 3) + node->intval);
  mixed ^= ((seed << 3) + node->intval2);

  rotated = (seed << 11) ^ (seed >> (8 * sizeof (seed) - 11));

  return exp_hash (node->right,
                   exp_hash (node->left, mixed ^ rotated));
}

/* Return a hash value for the expression tree rooted at NODE,
 * computed by exp_hash starting from a seed of 0.
 */
unsigned long
rx_exp_hash (struct rx_exp_node * node)
{
  unsigned long initial_seed = 0;

  return exp_hash (node, initial_seed);
}



/****************************************************************
 * rx_analyze_rexp
 *
 * The length of a string matched by a regexp may be fixed or
 * variable.  If it is fixed, that fact can be computed statically
 * and is useful for certain regexp matcher optimizations (quickly
 * discarding candidate matches of an impossible length).
 *
 * A regexp may be a regular expression, or an expression of a more
 * general type.  It is useful to know for each subexpression of
 * a non-regular expression, whether or not the subexpression is 
 * itself a regular expression.   It is useful to know if an overall
 * expression is regular or non-regular.  This information is used
 * to optimize regexp matching.
 *
 * rx_analyze_rexp computes those two properties.  Each node
 * has two corresponding fields, which rx_analyze_rexp sets:
 *
 * LEN :  	Set the fixed length of a string matching the
 *		expression, or -1 if the expression matches
 *		strings of more than one length.
 *
 * OBSERVED : 	Set to 0 if the expression is a regular expression,
 *		set to 1 otherwise.
 * 
 * Additionally, this function builds an array of pointers to parenthesized
 * subexpressions: this array is useful as a parameter to rx_simplify_rexp.
 */

/* Compute the "len" and "observed" fields of every node in the tree
 * rooted at NODE, and collect its numbered parenthesized
 * subexpressions.
 *
 * SUBEXPS   points to the array of subexpression pointers; *SUBEXPS
 *           may be null on entry, in which case the array is
 *           allocated on demand.  The array is grown by one element
 *           for each r_parens node whose intval is >= 0.
 * RE_NSUB   points to the number of elements in *SUBEXPS; it is
 *           incremented for each subexpression added.
 * NODE      is the tree to analyze; a null NODE is a no-op.
 *
 * A parenthesized node claims its slot before its children are
 * visited, and the slot is filled in after they have been analyzed.
 *
 * If the array can not be grown, panic is called.
 * NOTE(review): panic is presumed not to return -- confirm.  Should
 * it return, the subexpression count is restored and the node is
 * left unanalyzed.
 */
void
rx_analyze_rexp (struct rx_exp_node *** subexps,
                 int * re_nsub,
                 struct rx_exp_node * node)
{
  int this_subexp;

  if (!node)
    return;

  /* Only meaningful for r_parens nodes with intval >= 0. */
  this_subexp = -1;

  if ((node->type == r_parens) && (node->intval >= 0))
    {
      struct rx_exp_node ** grown;

      this_subexp = *re_nsub;
      ++*re_nsub;
      if (!*subexps)
        grown = ((struct rx_exp_node **)
                 xmalloc (sizeof (struct rx_exp_node *) * *re_nsub));
      else
        {
          /* Keep the old pointer until the new one is known to be
           * good, so a failed realloc neither leaks the array nor
           * leaves *subexps null.
           */
          grown = ((struct rx_exp_node **)
                   realloc (*subexps,
                            sizeof (struct rx_exp_node *) * *re_nsub));
          if (!grown)
            {
              --*re_nsub;
              panic ("out of memory in rx_analyze_rexp");
              return;
            }
        }
      *subexps = grown;
    }

  /* Children first: the properties of a composite node are
   * synthesized from those of its operands.
   */
  if (node->left)
    rx_analyze_rexp (subexps, re_nsub, node->left);

  if (node->right)
    rx_analyze_rexp (subexps, re_nsub, node->right);

  switch (node->type)
    {
    case r_cset:
      node->len = 1;
      node->observed = 0;
      break;

    case r_string:
      node->len = node->cstr.len;
      node->observed = 0;
      break;

    case r_cut:
      node->len = 0;
      node->observed = 0;
      break;

    case r_concat:
    case r_alternate:
      {
        int lob, rob;
        int llen, rlen;
        lob = (!node->left ? 0 : node->left->observed);
        rob = (!node->right ? 0 : node->right->observed);
        llen = (!node->left ? 0 : node->left->len);
        rlen = (!node->right ? 0 : node->right->len);
        /* Fixed length only if both operands are fixed: the sum for
         * a concatenation, the common length for an alternation.
         */
        node->len = ((llen >= 0) && (rlen >= 0)
                     ? ((node->type == r_concat)
                        ? llen + rlen
                        : ((llen == rlen) ? llen : -1))
                     : -1);
        node->observed = lob || rob;
        break;
      }

    case r_star:
      node->len = -1;
      node->observed = (node->left
                        ? node->left->observed
                        : 0);
      break;

    case r_interval:
      node->len = -1;
      node->observed = 1;
      break;

    case r_parens:
      if (node->intval >= 0)
        {
          node->observed = 1;
          (*subexps)[this_subexp] = node;
        }
      else
        node->observed = (node->left
                          ? node->left->observed
                          : 0);
      node->len = (node->left
                   ? node->left->len
                   : 0);
      break;

    case r_context:
      switch (node->intval)
        {
        default:
          node->observed = 1;
          node->len = -1;
          break;
        case '^':
        case '$':
          node->observed = 1;
          node->len = 0;
          break;
        }
      break;

    }

  return;
}


/****************************************************************
 * rx_simplify_rexp
 *
 * Convert an expression which may not be a regular expression
 * into a regular expression matching a superset of that pattern.
 *
 * This is useful for a matching heuristic: the regular superset
 * language is used to find a candidate string for a match, the 
 * original irregular expression is used to verify the match.
 *
 * If the input expression is a regular expression, this is the
 * identity function.
 *
 * "*answer" is a return parameter.
 *
 * "subexps" is an array of pointers into the expression "node".
 *	     Element N of the array is the Nth parenthesized
 *	     subexpression of "node".  This array is usually
 *	     computed by rx_analyze_rexp.
 */

/* Store in *ANSWER the simplified form of NODE; the stored pointer
 * carries a reference taken on behalf of the caller.
 *
 *   - A null NODE yields a null answer.
 *   - A node whose "observed" flag is 0 is its own answer.
 *   - A node that already has a "simplified" form yields that form.
 *   - Otherwise the answer is computed by type (see below), cached in
 *     NODE's "simplified" field, and the cache takes its own
 *     reference.
 *
 * Nodes of type r_cset, r_string and r_cut are expected to have been
 * handled by the "observed" test; reaching the switch with one of
 * them (or with an unknown type) is reported with panic.
 */
void
rx_simplify_rexp (struct rx_exp_node ** answer,
                  int cset_size,
                  struct rx_exp_node *node,
                  struct rx_exp_node ** subexps)
{
  struct rx_exp_node *rebuilt;

  if (node == 0)
    {
      *answer = 0;
      return;
    }

  if (node->observed == 0)
    {
      *answer = node;
      rx_save_rexp (node);
      return;
    }

  if (node->simplified != 0)
    {
      *answer = node->simplified;
      rx_save_rexp (node->simplified);
      return;
    }

  switch (node->type)
    {
    case r_concat:
    case r_alternate:
    case r_star:
    case r_interval:
      /* Rebuild the operator node over simplified operands. */
      rebuilt = rx_exp_node (node->type);

      if (node->cset)
        rebuilt->cset = bitset_dup (cset_size, node->cset);

      rebuilt->intval = node->intval;
      rebuilt->intval2 = node->intval2;
      rx_simplify_rexp (&rebuilt->left, cset_size, node->left, subexps);
      rx_simplify_rexp (&rebuilt->right, cset_size, node->right, subexps);
      *answer = rebuilt;
      break;

    case r_context:
      /* A digit selects a parenthesized subexpression from SUBEXPS
       * (presumably a back-reference); any other context operator
       * simplifies to the null expression.
       */
      if (!isdigit (node->intval))
        *answer = 0;
      else
        rx_simplify_rexp (answer, cset_size, subexps [node->intval - '0'], subexps);
      break;

    case r_parens:
      /* The parentheses themselves are dropped. */
      rx_simplify_rexp (answer, cset_size, node->left, subexps);
      break;

    case r_cset:
    case r_string:
    case r_cut:
    default:
      panic ("bogus regexp in rx_simplify_rexp");
      return;
    }

  node->simplified = *answer;
  rx_save_rexp (node->simplified);
}



/****************************************************************
 * rx_is_anchored_p
 *
 * Is an expression "anchored"?
 *
 * The expression "^" is "anchored" which means that the subexpression 
 * can match only at the beginning of a string.  No other simple kind
 * of expression shares this property.  It is a synthetic property of
 * composite expressions whether or not they are anchored.
 *
 * Knowing whether an expression is anchored or not is useful for optimizing
 * some common kinds of regexp search, so this function computes that property.
 *
 */

/* Return 1 if EXP is anchored, 0 otherwise.
 *
 *   - A context node is anchored exactly when its intval is '^'.
 *   - Parentheses and concatenations are anchored when their left
 *     operand is.
 *   - An alternation is anchored when both branches are.
 *   - An interval is anchored when its intval is nonzero and its
 *     left operand is anchored.
 *   - Everything else, including a null expression, is not anchored.
 */
int
rx_is_anchored_p (struct rx_exp_node * exp)
{
  /* Walk down the tree; only alternation needs a real recursive
   * call, every other case just moves on to a single operand.
   */
  while (exp != 0)
    {
      switch (exp->type)
        {
        case r_context:
          return (exp->intval == '^');

        case r_alternate:
          if (!rx_is_anchored_p (exp->left))
            return 0;
          exp = exp->right;
          continue;

        case r_interval:
          if (exp->intval == 0)
            return 0;
          exp = exp->left;
          continue;

        case r_parens:
        case r_concat:
          exp = exp->left;
          continue;

        case r_star:
        case r_cset:
        case r_string:
        case r_cut:
        default:
          return 0;
        }
    }

  return 0;
}




/****************************************************************
 * rx_fill_in_fastmap
 *
 * If a pattern can not match the empty string, then there is
 * a set of characters (the "fastmap") from which the first character 
 * of a matching string must come.  For some patterns, the fastmap is 
 * smaller than the complete character set and is easy to compute.  
 * Knowing the fastmap is useful for optimizing some kinds of
 * regexp search.
 *
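 * This function records the set in MAP, an array of bytes indexed by
 * character code: entries for members of the set are set to 1, and
 * other entries are left as the caller initialized them.  The return
 * value is 1 if the pattern may match the empty string and 0 otherwise.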
 *
 */

/* Mark in MAP every character that can begin a match of EXP.
 *
 * CSET_SIZE  number of entries of MAP that are set when EXP may match
 *            the empty string.
 * MAP        array of bytes indexed by character code.  Entries are
 *            only ever set to 1, never cleared.
 * EXP        the expression; a null EXP is treated as matching the
 *            empty string.
 *
 * Returns 1 if EXP may match the empty string, 0 otherwise.  For a
 * null EXP, a star, a cut, a context operator or an interval whose
 * intval is 0, the first CSET_SIZE entries of MAP are all set.  An
 * empty r_string returns 1 without touching MAP.
 */
int
rx_fill_in_fastmap (int cset_size, unsigned char * map, struct rx_exp_node * exp)
{
  if (!exp)
    {
    can_match_empty:
      {
        int x;
        for (x = 0; x < cset_size; ++x)
          map[x] = 1;
      }
      return 1;
    }
  
  switch (exp->type)
    {
    case r_cset:
      {
        int x;
        int most;
        
        most = exp->cset_size;
        for (x = 0; x < most; ++x)
          if (bitset_member (exp->cset, x))
            map[x] = 1;
      }
      return 0;

    case r_string:
      if (exp->cstr.len)
        {
          /* Index by the byte's unsigned value: a plain char may be
           * signed, and a negative index would write outside MAP.
           */
          map[(unsigned char)exp->cstr.chr[0]] = 1;
          return 0;
        }
      else
        return 1;

    case r_concat:
      /* The right operand contributes first characters only when the
       * left operand can match the empty string.
       */
      return (   rx_fill_in_fastmap (cset_size, map, exp->left)
              && rx_fill_in_fastmap (cset_size, map, exp->right));

    case r_alternate:
      {
        int left_empty;
        int right_empty;

        /* Both branches must always contribute.  Short-circuiting
         * here would skip the right branch whenever the left one is
         * an empty string, which returns 1 without marking anything;
         * an enclosing concatenation could then report a fastmap that
         * lacks the right branch's first characters.
         */
        left_empty = rx_fill_in_fastmap (cset_size, map, exp->left);
        right_empty = rx_fill_in_fastmap (cset_size, map, exp->right);
        return (left_empty || right_empty);
      }

    case r_parens:
      return rx_fill_in_fastmap (cset_size, map, exp->left);

    case r_star:
      goto can_match_empty;

    case r_interval:
      if (exp->intval == 0)
        goto can_match_empty;
      else
        return rx_fill_in_fastmap (cset_size, map, exp->left);
      
    case r_cut:
      goto can_match_empty;
      
    case r_context:
      goto can_match_empty;

    default:
      return panic ("bogus regexp in rx_fill_in_fastmap");
    }
}


