/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <signal.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#include <fcntl.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <khash.h>
#include <webbase.h>
#include <salloc.h>
#include <getopttools.h>
#include <webbase_url.h>
#include <sqlutil.h>
#include <md5str.h>
#include <server.h>

/*
 * Verbosity level of this file's diagnostics on stderr (0 prints none).
 * The "verbose_webbase" entry of long_options stores its address so that
 * option parsing can set it.
 */
static int verbose = 0;

/*
 * Forward declarations of helpers private to this file.
 */
static webbase_url_start_t* webbase_decode_start(webbase_t* base, MYSQL_RES *res, MYSQL_ROW row, webbase_url_start_t* start);
static webbase_url_t* webbase_decode_url(webbase_t* base, MYSQL_RES *res, MYSQL_ROW row, webbase_url_t* webbase_url);
static webbase_url_start_t* webbase_get_start_1(webbase_t* base, char* quote, char* field, unsigned char* value, int value_length, webbase_url_start_t* start);
static webbase_t* webbase_alloc_1();

/*
 * Command line options of this module, in getopt_long layout
 * ({name, has_arg, flag, val}). Returned by webbase_options() and
 * interpreted by name in webbase_alloc().
 */
static struct option long_options[] =
{
  /* Flag option: takes no argument and sets `verbose' to 1. */
  {"verbose_webbase", 0, &verbose, 1},
  /* The options below each take one argument. */
  {"base", 1, 0, 0},
  {"user", 1, 0, 0},
  {"password", 1, 0, 0},
  {"port", 1, 0, 0},
  {"host", 1, 0, 0},
  {"socket", 1, 0, 0},
  {"datadir", 1, 0, 0},
  /*
   * Terminator (null name).
   * NOTE(review): val holds WEBBASE_OPTIONS, presumably consumed by the
   * code that combines option tables -- confirm against getopttools.
   */
  {0, 0, 0, WEBBASE_OPTIONS}
};

/*
 * Help text for the entries of long_options, in the same order:
 * {usage string, description}. Returned by webbase_help_options().
 */
static struct option_help long_options_help[] =
{
  /* Flag option, no argument. */
  {"verbose_webbase", "database manipulation related messages."},
  /* Options taking an argument. */
  {"base <base name>", "name of the mysql database to use for the Meta data information."},
  {"user <name>", "name of the user to connect to database."},
  {"password <password>", "password of the -user to connect to database."},
  {"port <port>", "TCP/IP port to connect to database, if not default."},
  {"host <hostname>", "hostname of the MySQL database."},
  {"socket <file>", "Unix socket file full path name for local database connection."},
  /*
   * NOTE(review): unlike the other argument-taking options this usage
   * string shows no <argument> placeholder although "datadir" is declared
   * with has_arg = 1 in long_options -- confirm whether this is intended.
   */
  {"datadir", "absolute path name of the directory containing the database."},
  /*
   * Last entry.
   * NOTE(review): presumably an end-of-table marker; it is the string "0",
   * not a null pointer -- confirm what the consumer expects.
   */
  {"0", ""}
};

/*
 * Give access to the getopt_long option table of this module
 * (long_options). The unnamed array argument is not used.
 */
struct option* webbase_options(struct option [])
{
  struct option* table = long_options;

  return table;
}

/*
 * Give access to the help table matching the options of this module
 * (long_options_help). The unnamed array argument is not used.
 */
struct option_help* webbase_help_options(struct option_help [])
{
  struct option_help* table = long_options_help;

  return table;
}

/*
 * Create a webbase_t, configure it from the command line and the
 * environment, then connect it to the MySQL server and select the database.
 *
 * argc, argv  command line, scanned with getopt_long_only().
 * options     option table to scan against. Entries that are not handled
 *             here are skipped silently.
 *
 * Options handled by name: base, host, user, password, datadir, socket,
 * port. Every handled option, and every option that only sets a flag, is
 * also recorded in base->options under its name; the stored value is a
 * copy of the argument, or " " when the option has no argument.
 *
 * Environment: WEBBASE_LOCK_WAIT overrides lock_wait (default 60) and
 * WEBBASE_LOCK_MAX_LOOP overrides lock_max_loop (default 200).
 *
 * Returns the connected object. Every error (name too long, connection
 * failure, database selection failure) is reported on stderr and
 * terminates the process with exit(1).
 */
webbase_t* webbase_alloc(int argc, char** argv, struct option options[])
{
  webbase_t* base = webbase_alloc_1();
  char* user = strdup("");
  char* password = strdup("");

  /* Restart the scan from scratch and keep getopt quiet about unknown options. */
  opterr = 0;
  optind = 0;
  while(1) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    int found = 1;
    int c = getopt_long_only(argc, argv, "-", options, &option_index);

    /* Detect the end of the options. */
    if(c == -1)
      break;

    /* Anything that is not a long option from the table is ignored. */
    if(c != 0)
      continue;

    /* If this option set a flag, there is nothing to interpret. */
    if(options[option_index].flag == 0) {
      const char* name = options[option_index].name;

      if(!strcmp(name, "base")) {
	if(strlen(optarg) > WEBBASE_NAME_LENGTH) {
	  fprintf(stderr, "webbase: base name %s too long (max %d)\n", optarg, WEBBASE_NAME_LENGTH);
	  exit(1);
	}
	strcpy(base->name, optarg);
      } else if(!strcmp(name, "host")) {
	if(strlen(optarg) > WEBBASE_HOST_LENGTH) {
	  fprintf(stderr, "webbase: host name %s too long (max %d)\n", optarg, WEBBASE_HOST_LENGTH);
	  exit(1);
	}
	strcpy(base->mysql_host, optarg);
      } else if(!strcmp(name, "user")) {
	free(user);
	user = strdup(optarg);
      } else if(!strcmp(name, "password")) {
	free(password);
	password = strdup(optarg);
      } else if(!strcmp(name, "datadir")) {
	/* Release the value of an earlier -datadir so that repeating the option does not leak. */
	free(base->dir);
	base->dir = strdup(optarg);
      } else if(!strcmp(name, "socket")) {
	/* Same as above for a repeated -socket. */
	free(base->mysql_unix_port);
	base->mysql_unix_port = strdup(optarg);
      } else if(!strcmp(name, "port")) {
	base->mysql_port = atoi(optarg);
      } else {
	/* Option belonging to another module. */
	found = 0;
      }
    }

    if(found) {
      hash_alloc_insert(base->options, (char*)options[option_index].name, strdup(optarg ? optarg : " "));
    }
  }

  if(base->dir == 0) base->dir = strdup(".");

  /*
   * Build path names
   */
  {
    if(strlen(base->dir) + WEBBASE_NAME_LENGTH + 1 > WEBBASE_PATH_LENGTH) {
      fprintf(stderr, "webbase: path too long, max %d\n", WEBBASE_PATH_LENGTH);
      exit(1);
    }
    sprintf(base->path, "%s/%s", base->dir, base->name);
  }

  /*
   * Open database
   */
  {
    if(base->mysql_host[0] == '\0') strcpy(base->mysql_host, "localhost");

    /*
     * Initialize structure and set defaults
     */
    mysql_init(&base->mysql);
    /*
     * Instruct real_connect to read missing parameters from ~/.my.cnf file.
     */
    mysql_options(&base->mysql, MYSQL_READ_DEFAULT_FILE, "my");

    /*
     * The host is never empty at this point (defaulted just above). An
     * empty user or password is passed as a null pointer.
     */
    if(!(mysql_real_connect(&base->mysql,
			    base->mysql_host,
			    user[0] != '\0' ? user : (char*)0,
			    password[0] != '\0' ? password : (char*)0,
			    (char*)0,
			    base->mysql_port,
			    base->mysql_unix_port,
			    0))) {
      fprintf(stderr, "webbase_alloc: connect: %s\n", mysql_error(&base->mysql));
      fprintf(stderr, "webbase_alloc: connect: failed to connect\n");
      exit(1);
    }
  }

  if(mysql_select_db(&base->mysql, base->name)) {
    fprintf(stderr, "select: %s\n", mysql_error(&base->mysql));
    exit(1);
  }

  webbase_default_start(base, &base->default_start);

  /*
   * Lock parameters: our pid identifies the locks we hold, the two
   * tunables may be overridden from the environment.
   */
  base->pid = getpid();
  {
    const char* lock_wait = getenv("WEBBASE_LOCK_WAIT");
    const char* lock_max_loop = getenv("WEBBASE_LOCK_MAX_LOOP");

    base->lock_wait = lock_wait ? atoi(lock_wait) : 60;
    base->lock_max_loop = lock_max_loop ? atoi(lock_max_loop) : 200;
  }

  if(verbose > 1) fprintf(stderr, "webbase_alloc: finished\n");

  free(user);
  free(password);

  return base;
}

/*
 * Release a webbase_t: free the option table, close the MySQL
 * connection, free the strings owned by the object and the object itself.
 */
void webbase_free(webbase_t* base)
{
  /* The servers object was not created here: forget it, do not delete it. */
  base->servers = 0;

  _K(hash_free)(base->options);
  mysql_close(&base->mysql);

  /* free() ignores null pointers, unset fields need no special case. */
  free(base->mysql_unix_port);
  free(base->dir);
  free(base);
}

/*
 * Publish, through the WEBBASE_IGNORE environment variable, the name of a
 * lock that webbase_lock() and webbase_unlock() must treat as a no-op.
 *
 * varint, varchar  identify the lock; the published name is the decimal
 *                  value of varint followed by varchar, the same
 *                  composition used by webbase_lock().
 *                  A null varchar resets WEBBASE_IGNORE to the empty string.
 *
 * The unnamed webbase_t* argument is not used.
 *
 * The string handed to putenv() lives in a static buffer that is reused on
 * every call. A name that does not fit in the buffer, or a putenv()
 * failure, is reported on stderr and terminates the process with exit(1).
 */
void webbase_lock_ignore(webbase_t*, int varint, char* varchar)
{
  static char name[512];

  if(varchar) {
    /* Bounded formatting: an overlong varchar must not overrun the buffer. */
    int length = snprintf(name, sizeof(name), "WEBBASE_IGNORE=%d%s", varint, varchar);
    if(length < 0 || (size_t)length >= sizeof(name)) {
      fprintf(stderr, "webbase_lock_ignore: lock name too long (max %d)\n", (int)sizeof(name) - 1);
      exit(1);
    }
    if(verbose > 1) fprintf(stderr, "\tset lock_ignored \n");
  } else {
    strcpy(name, "WEBBASE_IGNORE=");
    if(verbose > 1) fprintf(stderr, "\tunset lock_ignored \n");
  }

  if(putenv(name) != 0) {
    fprintf(stderr, "webbase_lock_ignore: unable to putenv(%s)\n", name);
    perror("");
    exit(1);
  }
}

/*
 * Insert a lock in the lock table. The name of the lock is
 * either the concatenation of the varint and varchar arguments (name), if
 * its length is not greater than MD5_ASCII_SIZE, or the
 * MD5 key calculated on (name).
 *
 * base             database connection; base->pid is stored in the lock row.
 * varint, varchar  identify the object to lock.
 *
 * If the name equals the value of the WEBBASE_IGNORE environment variable
 * (see webbase_lock_ignore) nothing is locked.
 *
 * When the lock row already exists, the pid stored in it is examined:
 *  - no row any more: retry immediately;
 *  - our own pid: fatal error (locked twice);
 *  - the process does not exist any more: the stale row is deleted and the
 *    insertion retried;
 *  - otherwise: sleep base->lock_wait seconds and retry.
 * More than base->lock_max_loop attempts is a fatal error.
 *
 * Returns 1 in all cases; fatal errors terminate the process with exit(1).
 */
int webbase_lock(webbase_t* base, int varint, char* varchar)
{
  char query[1024];
  int locked = 0;
  int loop_count = 0;
  char name[512];
  int name_length;
  unsigned char* md5;
  int md5_length;

  /* Bounded formatting: an overlong varchar must not overrun the buffer. */
  name_length = snprintf(name, sizeof(name), "%d%s", varint, varchar);
  if(name_length < 0 || (size_t)name_length >= sizeof(name)) {
    fprintf(stderr, "webbase_lock: lock name too long (max %d)\n", (int)sizeof(name) - 1);
    exit(1);
  }

  {
    char* ignore = getenv("WEBBASE_IGNORE");
    if(ignore && !strcmp(name, ignore)) {
      if(verbose) fprintf(stderr, "\tlock ignored %s\n", name);
      return 1;
    }
  }

  /*
   * Short names are used verbatim (and are not SQL-quoted: the callers in
   * this file only pass literal suffixes), long ones are replaced by their
   * MD5.
   */
  if(name_length <= MD5_ASCII_SIZE) {
    md5 = (unsigned char*)name;
    md5_length = name_length;
  } else {
    md5 = str2md5ascii_simple(name, name_length);
    md5_length = MD5_ASCII_SIZE;
  }

  sprintf(query, "insert low_priority into locks values ('%.*s', '%d')", md5_length, md5, base->pid);

  if(verbose > 1) fprintf(stderr, "\tlocking %s ...", name);
  while(locked == 0) {
    loop_count++;
    if(loop_count > base->lock_max_loop) {
      fprintf(stderr, "webbase_lock: looped more than %d time waiting for %s\n", base->lock_max_loop, name);
      exit(1);
    }
    if(mysql_query(&base->mysql, query)) {
      char query2[1024];
      char pid_string[32];
      /* Only a duplicate key means "already locked"; anything else is fatal. */
      if(mysql_errno(&base->mysql) != ER_DUP_ENTRY) {
	fprintf(stderr, "%s: %s\n", query, mysql_error(&base->mysql));
	exit(1);
      }
      if(verbose) fprintf(stderr, " check pid for %s ", name);
      sprintf(query2, "select high_priority pid from locks where name = '%.*s'", md5_length, md5);
      if(sql_select_value(&base->mysql, query2, pid_string, WEBBASE_INTEGER_VALUE_SIZE) == 0) {
	/*
	 * Lock is gone
	 */
	if(verbose) fprintf(stderr, " lock gone away ... ");
      } else {
	int pid = atoi(pid_string);

	if(pid == base->pid) {
	  fprintf(stderr, "webbase_lock: %s locked again\n", name);
	  exit(1);
	} else {
	  /*
	   * Probe the owner with signal 0. Only ESRCH proves that the
	   * process is gone; EPERM means it is alive but owned by another
	   * user, in which case its lock must be left alone.
	   */
	  if(kill(pid, 0) < 0 && errno == ESRCH) {
	    if(verbose) fprintf(stderr, " process died, cleanup ... ");
	    sprintf(query2, "delete low_priority from locks where name = '%.*s'", md5_length, md5);
	    smysql_query(&base->mysql, query2);
	  } else {
	    if(verbose) fprintf(stderr, " sleep %d ... ", base->lock_wait);
	    sleep(base->lock_wait);
	  }
	}
      }
    } else {
      if(verbose > 1) fprintf(stderr, " locked\n");
      locked = 1;
    }
  }
  return 1;
}

/*
 * Remove from the locks table the lock that this process (base->pid)
 * holds on the object identified by varint and varchar.
 *
 * The lock name is built exactly as in webbase_lock(): the decimal value
 * of varint followed by varchar, replaced by its MD5 when longer than
 * MD5_ASCII_SIZE. If the name equals the value of the WEBBASE_IGNORE
 * environment variable nothing is done.
 *
 * If no row was deleted the current holder (if any) is reported when
 * verbose is set and the process is terminated with abort().
 */
void webbase_unlock(webbase_t* base, int varint, char* varchar)
{
  char query[1024];
  char name[512];
  int name_length;
  unsigned char* md5;
  int md5_length;

  /* Bounded formatting: an overlong varchar must not overrun the buffer. */
  name_length = snprintf(name, sizeof(name), "%d%s", varint, varchar);
  if(name_length < 0 || (size_t)name_length >= sizeof(name)) {
    fprintf(stderr, "webbase_unlock: lock name too long (max %d)\n", (int)sizeof(name) - 1);
    exit(1);
  }

  {
    char* ignore = getenv("WEBBASE_IGNORE");
    if(ignore && !strcmp(name, ignore)) {
      if(verbose) fprintf(stderr, "\tlock ignored %s\n", name);
      return ;
    }
  }

  if(name_length <= MD5_ASCII_SIZE) {
    md5 = (unsigned char*)name;
    md5_length = name_length;
  } else {
    md5 = str2md5ascii_simple(name, name_length);
    md5_length = MD5_ASCII_SIZE;
  }

  /* Restricting on pid guarantees that only our own lock can be removed. */
  sprintf(query, "delete low_priority from locks where name = '%.*s' and pid = %d", md5_length, md5, base->pid);

  if(verbose > 1) fprintf(stderr, "\tunlocking %s ...", name);
  smysql_query(&base->mysql, query);
  if(mysql_affected_rows(&base->mysql) == 0) {
    char pid_string[32];
    if(verbose) fprintf(stderr, "not found ");
    sprintf(query, "select high_priority pid from locks where name = '%.*s'", md5_length, md5);
    if(sql_select_value(&base->mysql, query, pid_string, WEBBASE_INTEGER_VALUE_SIZE) == 0) {
      if(verbose) fprintf(stderr, ", no row ");
    } else {
      if(verbose) fprintf(stderr, ", held by %s ", pid_string);
    }
    fprintf(stderr, "webbase_unlock: failed to unlock %s\n", name);
    /* abort() does not return. */
    abort();
  }
  if(verbose > 1) fprintf(stderr, " done\n");
}

/*
 * start functions
 */
/*
 * Load the compile time defaults (WEBBASE_URL_START_DEFAULT_*) in the
 * numeric fields of `start'. The unnamed webbase_t* argument is not used.
 * Fields not listed here (strings, info, count, rowid, delay, ...) are
 * left untouched.
 */
void webbase_default_start(webbase_t*, webbase_url_start_t* start)
{
  /* The *_delay fields. */
  start->robot_delay = WEBBASE_URL_START_DEFAULT_ROBOT_DELAY;
  start->timeout_delay = WEBBASE_URL_START_DEFAULT_TIMEOUT_DELAY;
  start->not_found_delay = WEBBASE_URL_START_DEFAULT_NOT_FOUND_DELAY;
  start->modified_delay = WEBBASE_URL_START_DEFAULT_MODIFIED_DELAY;
  start->loaded_delay = WEBBASE_URL_START_DEFAULT_LOADED_DELAY;

  /* The remaining numeric fields. */
  start->timeout = WEBBASE_URL_START_DEFAULT_TIMEOUT;
  start->level = WEBBASE_URL_START_DEFAULT_LEVEL;
  start->depth = WEBBASE_URL_START_DEFAULT_DEPTH;
  start->min = WEBBASE_URL_START_DEFAULT_MIN;
  start->size_hrefs = WEBBASE_URL_START_DEFAULT_SIZE_HREFS;
  start->url_max_size = WEBBASE_URL_START_DEFAULT_URL_MAX_SIZE;
}

/*
 * Call `func' once for every row of the start table selected by `where'.
 *
 * where   text appended verbatim after "select rowid from start ".
 * func    callback, invoked as func(params, start) where start is the
 *         result of webbase_get_start_rowid() for the row (may be null,
 *         since that lookup only returns rows with disabled = 'no').
 * params  opaque pointer handed to the callback.
 *
 * When compiled with WEBBASE_LOCK the row is unlocked after the callback
 * returns. Errors are reported on stderr and terminate the process.
 */
void webbase_walk_start(webbase_t* base, char* where, webbase_walk_start_callback_t func, char* params)
{
  static char* sql = 0;
  static int sql_size = 0;
  MYSQL_RES* result;
  MYSQL_ROW record;
  int length = strlen(where);

  static_alloc(&sql, &sql_size, 256 + length);
  sprintf(sql, "select rowid from start %s", where);

  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);

  for(record = mysql_fetch_row(result); record; record = mysql_fetch_row(result)) {
    int start_rowid;

    if(record[0] == 0) {
      fprintf(stderr, "webbase_walk_start: unexpected rowid field == NULL\n");
      exit(1);
    }
    start_rowid = atoi(record[0]);
    (*func)(params, webbase_get_start_rowid(base, start_rowid, 0));
#ifdef WEBBASE_LOCK
    webbase_unlock(base, start_rowid, "start");
#endif /* WEBBASE_LOCK */
  }

  /* The loop must have stopped because the result set is exhausted. */
  if(!mysql_eof(result)) {
    fprintf(stderr, "webbase_walk eof: %s\n", mysql_error(&base->mysql));
    exit(1);
  }

  mysql_free_result(result);
}

/*
 * Call `func' once for every row of the url table selected by `where'.
 *
 * where   text appended verbatim after "select rowid from url ".
 * func    callback, invoked as func(params, url) where url is the result
 *         of webbase_get_url_rowid() for the row.
 * params  opaque pointer handed to the callback.
 * flag    forwarded to webbase_get_url_rowid().
 *
 * When compiled with WEBBASE_LOCK the row is unlocked after the callback
 * returns. Errors are reported on stderr and terminate the process.
 */
void webbase_walk_url(webbase_t* base, char* where, webbase_walk_url_callback_t func, char* params, int flag)
{
  static char* sql = 0;
  static int sql_size = 0;
  MYSQL_RES* result;
  MYSQL_ROW record;
  int length = strlen(where);

  static_alloc(&sql, &sql_size, 256 + length);
  sprintf(sql, "select rowid from url %s", where);
  if(verbose) fprintf(stderr, "webbase_walk_url: %s\n", sql);

  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);

  for(record = mysql_fetch_row(result); record; record = mysql_fetch_row(result)) {
    int url_rowid;

    if(record[0] == 0) {
      fprintf(stderr, "webbase_walk_url: unexpected rowid field == NULL\n");
      exit(1);
    }
    url_rowid = atoi(record[0]);
    (*func)(params, webbase_get_url_rowid(base, url_rowid, 0, flag));
#ifdef WEBBASE_LOCK
    webbase_unlock(base, url_rowid, "url");
#endif /* WEBBASE_LOCK */
  }

  /* The loop must have stopped because the result set is exhausted. */
  if(!mysql_eof(result)) {
    fprintf(stderr, "webbase_walk eof: %s\n", mysql_error(&base->mysql));
    exit(1);
  }

  mysql_free_result(result);
}

/*
 * Find the start entry associated with a URL.
 *
 * The start rowid is obtained from webbase_get_start2url() using
 * webbase_url->w_rowid. Returns a null pointer when that call returns 0,
 * otherwise the result of webbase_get_start_rowid(), which is called
 * with a null destination.
 */
webbase_url_start_t* webbase_get_start_of_url(webbase_t* base, webbase_url_t* webbase_url)
{
  int start_rowid = 0;
  int level = 0;

  if(webbase_get_start2url(base, &start_rowid, &webbase_url->w_rowid, &level) != 0)
    return webbase_get_start_rowid(base, start_rowid, 0);

  return 0;
}

/*
 * Set disabled = 'no' on the start row whose url_md5 column equals the
 * MD5_ASCII_SIZE bytes pointed to by url_md5.
 */
void webbase_enable_start(webbase_t* base, unsigned char* url_md5)
{
  static char* sql = 0;
  static int sql_size = 0;

  static_alloc(&sql, &sql_size, 256 + MD5_ASCII_SIZE);
  sprintf(sql, "update start set disabled = 'no' where url_md5 = '%.*s'", MD5_ASCII_SIZE, url_md5);

  smysql_query(&base->mysql, sql);
}

/*
 * Load the start entry whose url_md5 column equals the MD5_ASCII_SIZE
 * bytes pointed to by url_md5 (quoted comparison).
 * `start' and the return value are those of webbase_get_start_1().
 */
webbase_url_start_t* webbase_get_start(webbase_t* base, unsigned char* url_md5, webbase_url_start_t* start)
{
  webbase_url_start_t* loaded = webbase_get_start_1(base, "'", "url_md5", url_md5, MD5_ASCII_SIZE, start);

  return loaded;
}

/*
 * Load the start entry whose rowid column equals `rowid' (unquoted
 * numeric comparison).
 * `start' and the return value are those of webbase_get_start_1().
 */
webbase_url_start_t* webbase_get_start_rowid(webbase_t* base, int rowid, webbase_url_start_t* start)
{
  char text[32];
  /* sprintf returns the number of characters written, i.e. the string length. */
  int text_length = sprintf(text, "%d", rowid);

  return webbase_get_start_1(base, "", "rowid", (unsigned char*)text, text_length, start);
}

/*
 * Load one enabled (disabled = 'no') row of the start table.
 *
 * quote         text placed on each side of the value ("'" or "").
 * field         name of the column to compare.
 * value         value to compare with, value_length bytes long.
 * start         destination forwarded to webbase_decode_start().
 *
 * Returns the decoded entry, or a null pointer when no row matches.
 * When compiled with WEBBASE_LOCK the row is locked before being decoded
 * and a null pointer is returned if webbase_lock() returns 0.
 */
static webbase_url_start_t* webbase_get_start_1(webbase_t* base, char* quote, char* field, unsigned char* value, int value_length, webbase_url_start_t* start)
{
  static char* sql = 0;
  static int sql_size = 0;
  MYSQL_RES* result;

  static_alloc(&sql, &sql_size, 512 + value_length);
  sprintf(sql, "select " WEBBASE_START_FIELDS " from start where %s = %s%.*s%s and disabled = 'no'", field, quote, value_length, value, quote);

  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);

  if(mysql_num_rows(result) > 0) {
    MYSQL_ROW record = mysql_fetch_row(result);
    int rowid;

    /* The first selected column is used as the rowid. */
    if(record[0] == 0) {
      fprintf(stderr, "webbase_get_start_1: unexpected rowid field == NULL\n");
      exit(1);
    }
    rowid = atoi(record[0]);
#ifdef WEBBASE_LOCK
    if(!webbase_lock(base, rowid, "start"))
      start = 0;
    else
#endif /* WEBBASE_LOCK */
      start = webbase_decode_start(base, result, record, start);
  } else {
    start = 0;
  }

  mysql_free_result(result);
  return start;
}

/*
 * Write a start entry to the start table.
 *
 * start->rowid != 0: the existing row is modified with an UPDATE.
 * start->rowid == 0: server_add() is called with the URL, the row is
 *                    written with a REPLACE and start->rowid is set from
 *                    mysql_insert_id().
 *
 * Numeric fields whose value equals the one in base->default_start are
 * written as null. String fields are written (SQL-quoted) only when the
 * corresponding WEBBASE_URL_START_* bit is set in start->info, and as
 * null otherwise.
 *
 * All work buffers are static and reused from one call to the next.
 */
void webbase_update_start(webbase_t* base, webbase_url_start_t* start)
{
  /* Statement under construction. */
  static char* query = 0;
  static int query_size = 0;
  /* SQL-quoted copies of the string fields. */
  static char* accept = 0;
  static int accept_size = 0;
  static char* filter = 0;
  static int filter_size = 0;
  static char* allow = 0;
  static int allow_size = 0;
  static char* disallow = 0;
  static int disallow_size = 0;
  static char* regex_allow = 0;
  static int regex_allow_size = 0;
  static char* regex_disallow = 0;
  static int regex_disallow_size = 0;
  static char* hook_info = 0;
  static int hook_info_size = 0;
  static char* url = 0;
  static int url_size = 0;
  static char* url_md5 = 0;
  static int url_md5_size = 0;
  /* Which optional string fields are present, according to start->info. */
  int has_accept = start->info & WEBBASE_URL_START_ACCEPT;
  int has_filter = start->info & WEBBASE_URL_START_FILTER;
  int has_allow = start->info & WEBBASE_URL_START_ALLOW;
  int has_disallow = start->info & WEBBASE_URL_START_DISALLOW;
  int has_regex_allow = start->info & WEBBASE_URL_START_REGEX_ALLOW;
  int has_regex_disallow = start->info & WEBBASE_URL_START_REGEX_DISALLOW;
  int has_hook_info = start->info & WEBBASE_URL_START_HOOK_INFO;

  webbase_url_start_t* default_start = &base->default_start;

#define MINIMUM_QUERY_SIZE (20 * 1024)  
  static_alloc(&query, &query_size, MINIMUM_QUERY_SIZE);

  /* Quote the string fields that are present. */
  if(has_accept)
    sql_quote_char(&accept, &accept_size, start->accept, strlen(start->accept));
  if(has_filter)
    sql_quote_char(&filter, &filter_size, start->filter, strlen(start->filter));
  if(has_allow)
    sql_quote_char(&allow, &allow_size, start->allow, strlen(start->allow));
  if(has_disallow)
    sql_quote_char(&disallow, &disallow_size, start->disallow, strlen(start->disallow));
  if(has_regex_allow)
    sql_quote_char(&regex_allow, &regex_allow_size, start->regex_allow, strlen(start->regex_allow));
  if(has_regex_disallow)
    sql_quote_char(&regex_disallow, &regex_disallow_size, start->regex_disallow, strlen(start->regex_disallow));
  if(has_hook_info)
    sql_quote_char(&hook_info, &hook_info_size, start->hook_info, strlen(start->hook_info));

  /*
   * Make room for the quoted strings on top of the fixed part.
   * NOTE(review): url is only quoted further down, so url_size is the
   * value left by a previous call (0 on the first one); the URL of this
   * call presumably relies on the MINIMUM_QUERY_SIZE headroom -- confirm.
   */
  static_alloc(&query, &query_size, MINIMUM_QUERY_SIZE +
	       url_size +
	       accept_size +
	       filter_size +
	       allow_size +
	       disallow_size +
	       regex_allow_size +
	       regex_disallow_size +
	       hook_info_size);

  if(start->rowid != 0) {
    /* Existing row: update it in place. */
    sprintf(query, "update start set url = '%s', url_md5 = '%s', info = %d",
	    sql_quote_char(&url, &url_size, start->url, strlen(start->url)),
	    sql_quote_char(&url_md5, &url_md5_size, (char*)start->url_md5, MD5_ASCII_SIZE),
	    start->info);
    /* S(field) appends ", field = value", value being null when it equals the default. */
#define S(field) \
    strcat(query, ", " #field " = "); \
    strcat(query, start->field == default_start->field ? "null" : sql_itoa(start->field))
    S(url_max_size);
    S(size_hrefs);
    S(min);
    S(depth);
    S(level);
    S(timeout);
    S(loaded_delay);
    S(modified_delay);
    S(not_found_delay);
    S(timeout_delay);
    S(robot_delay);
    S(delay);
#undef S
    /* S(f,hasf) expands to the three arguments of a %c%s%c group: 'text' when present, bare null otherwise. */
    sprintf(query + strlen(query), ", accept = %c%s%c, filter = %c%s%c, allow = %c%s%c, disallow = %c%s%c, regex_allow = %c%s%c, regex_disallow = %c%s%c, hook_info = %c%s%c, count = %d",
#define S(f,hasf) (hasf ? '\'' : ' '), (hasf ? f : "null"), (hasf ? '\'' : ' ')
	  S(accept,has_accept),
	  S(filter,has_filter),
	  S(allow,has_allow),
	  S(disallow,has_disallow),
	  S(regex_allow,has_regex_allow),
	  S(regex_disallow,has_regex_disallow),
	  S(hook_info,has_hook_info),
	  start->count
	  );
#undef S
    sprintf(query + strlen(query), " where rowid = %d", start->rowid);
    if(verbose > 5) fprintf(stderr, "update start: %s\n", query);
  } else {
    /* New row: register the URL with the servers object, then insert. */
    server_add(base->servers, start->url);
    sprintf(query, "replace into start ("WEBBASE_START_FIELDS_INSERT") values (%d, '%s', '%s', %d, ",
	    start->rowid,
	    sql_quote_char(&url, &url_size, start->url, strlen(start->url)),
	    sql_quote_char(&url_md5, &url_md5_size, (char*)start->url_md5, MD5_ASCII_SIZE),
	    start->info);
    /* S(field) appends "value, ", value being null when it equals the default. */
#define S(field) \
    strcat(query, start->field == default_start->field ? "null" : sql_itoa(start->field)); \
    strcat(query, ", ")
    S(url_max_size);
    S(size_hrefs);
    S(min);
    S(depth);
    S(level);
    S(timeout);
    S(loaded_delay);
    S(modified_delay);
    S(not_found_delay);
    S(timeout_delay);
    S(robot_delay);
    S(delay);
#undef S
    /* Same %c%s%c convention as in the update branch. */
    sprintf(query + strlen(query), "%c%s%c, %c%s%c, %c%s%c, %c%s%c, %c%s%c, %c%s%c, %c%s%c, %d",
#define S(f,hasf) (hasf ? '\'' : ' '), (hasf ? f : "null"), (hasf ? '\'' : ' ')
	  S(accept,has_accept),
	  S(filter,has_filter),
	  S(allow,has_allow),
	  S(disallow,has_disallow),
	  S(regex_allow,has_regex_allow),
	  S(regex_disallow,has_regex_disallow),
	  S(hook_info,has_hook_info),
	  start->count
	  );
#undef S
    strcat(query, ")");
    if(verbose > 5) fprintf(stderr, "insert start: %s\n", query);
  }

  smysql_query(&base->mysql, query);
  /* A freshly inserted row gets its rowid from the database. */
  if(start->rowid == 0) {
    start->rowid = mysql_insert_id(&base->mysql);
  }
#undef MINIMUM_QUERY_SIZE
}

/*
 * Copy into `to' every field of `from' that carries a value.
 *
 * A numeric field carries a value when it differs from the "unset"
 * marker written by webbase_start_reset() (negative for url_max_size,
 * size_hrefs, min and timeout; not strictly positive for depth and level;
 * '\377' for the *_delay fields).
 * A string field carries a value when its pointer is not null and its
 * bit is set in from->info; the bit is then also set in to->info.
 */
void webbase_merge_start(webbase_url_start_t* to, webbase_url_start_t* from)
{
  /* Copy `field' when `from->field test' holds. */
#define MERGE_WHEN(field, test) \
  do { \
    if(from->field test) to->field = from->field; \
  } while(0)

  MERGE_WHEN(url_max_size, >= 0);
  MERGE_WHEN(size_hrefs, >= 0);
  MERGE_WHEN(min, >= 0);
  MERGE_WHEN(depth, > 0);
  MERGE_WHEN(level, > 0);
  MERGE_WHEN(timeout, >= 0);
  MERGE_WHEN(loaded_delay, != '\377');
  MERGE_WHEN(modified_delay, != '\377');
  MERGE_WHEN(not_found_delay, != '\377');
  MERGE_WHEN(timeout_delay, != '\377');
  MERGE_WHEN(robot_delay, != '\377');
#undef MERGE_WHEN

  /* Copy a string field, growing the destination buffer as needed. */
#define MERGE_STRING(field,field_length,flag) \
  do { \
    if(from->field && (from->info & flag)) { \
      static_alloc(&to->field, &to->field_length, from->field_length); \
      strcpy(to->field, from->field); \
      to->info |= flag; \
    } \
  } while(0)

  MERGE_STRING(accept,accept_length,WEBBASE_URL_START_ACCEPT);
  MERGE_STRING(filter,filter_length,WEBBASE_URL_START_FILTER);
  MERGE_STRING(allow,allow_length,WEBBASE_URL_START_ALLOW);
  MERGE_STRING(disallow,disallow_length,WEBBASE_URL_START_DISALLOW);
  MERGE_STRING(regex_allow,regex_allow_length,WEBBASE_URL_START_REGEX_ALLOW);
  MERGE_STRING(regex_disallow,regex_disallow_length,WEBBASE_URL_START_REGEX_DISALLOW);
  MERGE_STRING(hook_info,hook_info_length,WEBBASE_URL_START_HOOK_INFO);
#undef MERGE_STRING
}

/*
 * Free the string buffers owned by a start entry. The structure itself
 * is not freed and its pointers are left as they are.
 */
void webbase_start_free(webbase_url_start_t* start)
{
  /* free() ignores null pointers, unset fields need no special case. */
  free(start->accept);
  free(start->filter);
  free(start->allow);
  free(start->disallow);
  /*
   * regex_allow and regex_disallow: deallocation added after a purify
   * report, by benoit orihuela (benoit.orihuela@IDEALX.com), 2000/08/04
   */
  free(start->regex_allow);
  free(start->regex_disallow);
  free(start->hook_info);
}

/*
 * Put a start entry in the "nothing set" state: empty url and url_md5,
 * info, count, rowid and delay at 0, -1 in the integer parameters,
 * '\377' in the *_delay parameters. String buffers that are already
 * allocated are kept and truncated to the empty string.
 */
void webbase_start_reset(webbase_url_start_t* start)
{
  /* Truncate a string buffer, if there is one. */
#define EMPTY_STRING(field) \
  do { \
    if(start->field) start->field[0] = '\0'; \
  } while(0)

  /* Identity and counters. */
  start->rowid = 0;
  start->count = 0;
  start->delay = 0;
  start->info = 0;
  start->url[0] = '\0';
  memset(start->url_md5, '\0', sizeof(start->url_md5));

  /* Integer parameters: -1 stands for "unset". */
  start->url_max_size = -1;
  start->size_hrefs = -1;
  start->min = -1;
  start->depth = -1;
  start->level = -1;
  start->timeout = -1;

  /* Delay parameters: '\377' stands for "unset". */
  start->loaded_delay = '\377';
  start->modified_delay = '\377';
  start->not_found_delay = '\377';
  start->timeout_delay = '\377';
  start->robot_delay = '\377';

  EMPTY_STRING(accept);
  EMPTY_STRING(filter);
  EMPTY_STRING(allow);
  EMPTY_STRING(disallow);
  EMPTY_STRING(regex_allow);
  EMPTY_STRING(regex_disallow);
  EMPTY_STRING(hook_info);
#undef EMPTY_STRING
}

/*
 * Fill a start entry from one row of a result set of the start table.
 *
 * res    result set, used to obtain the name of each column.
 * row    the row to decode.
 * start  destination; when null a static object local to this function is
 *        used, so the returned pointer is then overwritten by the next
 *        such call.
 *
 * The entry is first reset, then each column is dispatched on its name.
 * Numeric parameters that are NULL or negative in the row take the value
 * found in base->default_start. String columns that are not NULL are
 * copied and their bit is set in start->info. If no usable url_md5 column
 * was found, url_md5 is computed from the url.
 *
 * Returns the filled entry.
 */
static webbase_url_start_t* webbase_decode_start(webbase_t* base, MYSQL_RES *res, MYSQL_ROW row, webbase_url_start_t* start)
{
  static webbase_url_start_t start_object;
  webbase_url_start_t* default_start = &base->default_start;
  unsigned int i;
  /*
   * Set if url_md5 field found and not empty.
   */
  int url_md5_set = 0;

  if(!start)
    start = &start_object;

  webbase_start_reset(start);

  for(i = 0; i < mysql_num_fields(res); i++) {
    MYSQL_FIELD* field;
    /* Position the field cursor on column i, then read its description. */
    mysql_field_seek(res, i);
    field = mysql_fetch_field(res);
    if(!strcmp(field->name, "rowid")) {
      if(row[i]) start->rowid = atoi(row[i]);
    } else if(!strcmp(field->name, "url")) {
      if(row[i]) {
	/* Truncating copy; the terminator is written at index WEBBASE_URL_LENGTH. */
	strncpy(start->url, row[i], WEBBASE_URL_LENGTH);
	start->url[WEBBASE_URL_LENGTH] = '\0';
      }
    } else if(!strcmp(field->name, "info+0")) {
      if(row[i]) start->info = atoi(row[i]);
      /*
       * S(f) closes the previous branch and opens the branch of numeric
       * column f: NULL or negative values are replaced by the default.
       */
#define S(f) \
    } else if(!strcmp(field->name, #f)) { \
      int value = row[i] ? atoi(row[i]) : -1; \
      start->f = value >= 0 ? value : default_start->f; 
    S(url_max_size);
    S(size_hrefs);
    S(min);
    S(depth);
    S(level);
    S(timeout);
    S(loaded_delay);
    S(modified_delay);
    S(not_found_delay);
    S(timeout_delay);
    S(robot_delay);
    S(delay);
#undef S
      /*
       * S(f,f_length,flag) closes the previous branch and opens the branch
       * of string column f: a non NULL value is copied in a buffer grown
       * as needed and `flag' is set in start->info.
       */
#define S(f,f_length,flag) \
    } else if(!strcmp(field->name, #f)) { \
      if(row[i]) { \
	int length = strlen(row[i]); \
	static_alloc(&start->f, &start->f_length, length + 1); \
	strcpy(start->f, row[i]); \
        start->info |= flag; \
      }
    S(accept, accept_length, WEBBASE_URL_START_ACCEPT);
    S(filter, filter_length, WEBBASE_URL_START_FILTER);
    S(allow, allow_length, WEBBASE_URL_START_ALLOW);
    S(disallow, disallow_length, WEBBASE_URL_START_DISALLOW);
    S(regex_allow, regex_allow_length, WEBBASE_URL_START_REGEX_ALLOW);
    S(regex_disallow, regex_disallow_length, WEBBASE_URL_START_REGEX_DISALLOW);
    S(hook_info, hook_info_length, WEBBASE_URL_START_HOOK_INFO);
#undef S
    } else if(!strcmp(field->name, "count")) {
      start->count = row[i] ? atoi(row[i]) : 0;
    } else if(!strcmp(field->name, "url_md5")) {
      /* Accepted only when the column is declared with the expected length. */
      if(row[i] && field->length == MD5_ASCII_SIZE) {
	url_md5_set = 1;
	memcpy(start->url_md5, row[i], sizeof(start->url_md5));
      }
    }
  }

  /* No stored key: derive it from the URL. */
  if(!url_md5_set)
    str2md5ascii(start->url, strlen(start->url), start->url_md5);

  if(verbose > 1) fprintf(stderr, "decode_start: %s\n", start->url);
  return start;
}

/*
 * Replace the bits of start->info covered by WEBBASE_URL_START_STATE_MASK
 * with `state' and write the entry to the database with
 * webbase_update_start().
 */
void webbase_start_state(webbase_t* base, webbase_url_start_t* start, int state)
{
  /* Clear the previous state, then install the new one. */
  start->info &= ~WEBBASE_URL_START_STATE_MASK;
  start->info |= state;

  webbase_update_start(base, start);
}

/*
 * url functions
 */

/*
 * Write a URL record to the database.
 *
 * If WEBBASE_URL_INFO_COMPLETE is set in w_info, the companion row of
 * table url_complete is written with a REPLACE and its rowid, as reported
 * by mysql_insert_id(), is stored in the url row. Otherwise, if the
 * record still references a url_complete row (w_complete_rowid != 0),
 * that row is deleted and the url row stores 0.
 * The row of table url is then written with a REPLACE.
 *
 * Optional string fields are written (SQL-quoted) when their
 * WEBBASE_URL_INFO_* bit is set in w_info, and as null otherwise.
 *
 * All work buffers are static and reused from one call to the next.
 *
 * Always returns 0.
 */
int webbase_insert_url(webbase_t* base, webbase_url_t* webbase_url)
{
  static char* query = 0;
  static int query_size = 0;
  /* rowid of the url_complete row to reference from the url row, 0 if none. */
  int complete_rowid = 0;

#define MINIMUM_QUERY_SIZE (20 * 1024)  
  static_alloc(&query, &query_size, MINIMUM_QUERY_SIZE);

  if(webbase_url->w_info & WEBBASE_URL_INFO_COMPLETE) {
    /*
     * Insert or replace data.
     */
    static char* relative = 0;
    static int relative_size = 0;
    static char* absolute = 0;
    static int absolute_size = 0;
    static char* location = 0;
    static int location_size = 0;
    static char* base_url = 0;
    static int base_url_size = 0;
    static char* keywords = 0;
    static int keywords_size = 0;
    static char* description = 0;
    static int description_size = 0;
    int has_relative = webbase_url->w_info & WEBBASE_URL_INFO_RELATIVE;
    int has_absolute = webbase_url->w_info & WEBBASE_URL_INFO_ABSOLUTE;
    int has_base = webbase_url->w_info & WEBBASE_URL_INFO_BASE;
    int has_location = webbase_url->w_info & WEBBASE_URL_INFO_LOCATION;
    int has_keywords = webbase_url->w_info & WEBBASE_URL_INFO_KEYWORDS;
    int has_description = webbase_url->w_info & WEBBASE_URL_INFO_DESCRIPTION;

    /* relative and absolute are quoted up front, the others inside the sprintf (T macro). */
    if(has_relative)
      sql_quote_char(&relative, &relative_size, webbase_url->w_relative, strlen(webbase_url->w_relative)); 
    if(has_absolute)
      sql_quote_char(&absolute, &absolute_size, webbase_url->w_absolute, strlen(webbase_url->w_absolute)); 
    /*
     * NOTE(review): the buffer is grown from the w_*_length fields only;
     * keywords, description and the growth caused by SQL quoting are not
     * counted and presumably rely on the MINIMUM_QUERY_SIZE headroom --
     * confirm that this bound holds.
     */
    static_alloc(&query, &query_size,
		 MINIMUM_QUERY_SIZE +
		 webbase_url->w_url_length +
		 webbase_url->w_base_url_length +
		 webbase_url->w_location_length +
		 webbase_url->w_relative_length +
		 webbase_url->w_absolute_length
		 );

    /*
     * Both macros expand to the three arguments of a %c%s%c group: 'text'
     * when the field is present, bare null otherwise. S uses an already
     * quoted buffer, T quotes the field of webbase_url on the fly.
     */
#define S(f,hasf) (hasf ? '\'' : ' '), (hasf ? f : "null"), (hasf ? '\'' : ' ')
#define T(f,ffield,fsize,hasf) (hasf ? '\'' : ' '), (hasf ? sql_quote_char(&f, &fsize, webbase_url->ffield, strlen(webbase_url->ffield)) : "null"), (hasf ? '\'' : ' ')

    sprintf(query, "replace into url_complete values (%d, %c%s%c, %c%s%c, %c%s%c, %c%s%c, %c%s%c, %c%s%c)",
	    webbase_url->w_complete_rowid,
	    T(keywords, w_keywords, keywords_size, has_keywords),
	    T(description, w_description, description_size, has_description),
	    T(base_url, w_base_url, base_url_size, has_base),
	    S(relative,has_relative),
	    S(absolute,has_absolute),
	    T(location, w_location, location_size, has_location)
	    );
    if(verbose > 5) fprintf(stderr, "replace complete: %s\n", query);
    smysql_query(&base->mysql, query);
    complete_rowid = mysql_insert_id(&base->mysql);
  } else if(webbase_url->w_complete_rowid) {
    /*
     * The URL no longer carries 'complete' information: remove the
     * url_complete row it used to reference.
     */
    sprintf(query, "delete from url_complete where rowid = %d", webbase_url->w_complete_rowid);
    smysql_query(&base->mysql, query);
  }
    
  {
    static char* url = 0;
    static int url_size = 0;
    static char* url_md5 = 0;
    static int url_md5_size = 0;
    static char* md5 = 0;
    static int md5_size = 0;
    static char* extract = 0;
    static int extract_size = 0;
    static char* title = 0;
    static int title_size = 0;
    int has_extract = webbase_url->w_info & WEBBASE_URL_INFO_EXTRACT;
    int has_title = webbase_url->w_info & WEBBASE_URL_INFO_TITLE;

    /*
     * NOTE(review): w_content_type and w_language are inserted between
     * quotes without going through sql_quote_char -- confirm that they can
     * never contain a quote. The buffer is not grown for url, extract and
     * title here.
     */
    sprintf(query, "replace into url values (%d, '%s', '%s', %d, %d, FROM_UNIXTIME(%ld), FROM_UNIXTIME(%ld), %d, '%s', %d, '%s', %d, FROM_UNIXTIME(%ld), %d, '%s', %c%s%c, %c%s%c)",
	    webbase_url->w_rowid,
	    sql_quote_char(&url, &url_size, webbase_url->w_url, strlen(webbase_url->w_url)),
	    sql_quote_char(&url_md5, &url_md5_size, (char*)webbase_url->w_url_md5, MD5_ASCII_SIZE), 
	    webbase_url->w_info,
	    webbase_url->w_code,
	    (long)webbase_url->w_mtime,
	    (long)webbase_url->w_mtime_error,
	    webbase_url->w_tags,
	    webbase_url->w_content_type,
	    webbase_url->w_content_length,
	    sql_quote_char(&md5, &md5_size, (char*)webbase_url->w_md5, MD5_ASCII_SIZE),
	    complete_rowid,
	    (long)webbase_url->w_crawl,
	    webbase_url->w_hookid,
	    webbase_url->w_language,
	    T(extract, w_extract, extract_size, has_extract),
	    T(title, w_title, title_size, has_title)
	    );
    if(verbose > 5) fprintf(stderr, "insert url: %s\n", query);
    smysql_query(&base->mysql,query);
  }
#undef S
#undef T
#undef MINIMUM_QUERY_SIZE
  return 0;
}

/*
 * Load the url record whose url_md5 column equals the MD5_ASCII_SIZE
 * bytes pointed to by url_md5 (quoted comparison).
 * `webbase_url', `flag' and the return value are those of
 * webbase_get_url_1().
 */
webbase_url_t* webbase_get_url(webbase_t* base, unsigned char* url_md5, webbase_url_t* webbase_url, int flag)
{
  webbase_url_t* loaded = webbase_get_url_1(base, "'", "url_md5", url_md5, MD5_ASCII_SIZE, webbase_url, flag);

  return loaded;
}

/*
 * Load the url record whose rowid column equals `rowid' (unquoted
 * numeric comparison).
 * `webbase_url', `flag' and the return value are those of
 * webbase_get_url_1().
 */
webbase_url_t* webbase_get_url_rowid(webbase_t* base, int rowid, webbase_url_t* webbase_url, int flag)
{
  char text[32];
  /* sprintf returns the number of characters written, i.e. the string length. */
  int text_length = sprintf(text, "%d", rowid);

  return webbase_get_url_1(base, "", "rowid", (unsigned char*)text, text_length, webbase_url, flag);
}

/*
 * Load one row of the url table and, on request, its url_complete
 * companion.
 *
 * quote         text placed on each side of the value ("'" or "").
 * field         name of the column to compare.
 * value         value to compare with, value_length bytes long.
 * webbase_url   destination; when null a static object local to this
 *               function is used.
 * flag          when WEBBASE_GET_URL_ALL is set and the record references
 *               a url_complete row, that row is decoded as well.
 *
 * The destination is reset before the lookup, whether a row is found or
 * not. Returns the decoded record, or a null pointer when no row matches.
 * When compiled with WEBBASE_LOCK the row is locked before being decoded.
 * A missing url_complete row is a fatal error.
 */
webbase_url_t* webbase_get_url_1(webbase_t* base, char* quote, char* field, unsigned char* value, int value_length, webbase_url_t* webbase_url, int flag)
{
  static char* sql = 0;
  static int sql_size = 0;
  static webbase_url_t fallback;
  MYSQL_RES* result;
  MYSQL_ROW record = 0;
  int rowid;

  if(webbase_url == 0)
    webbase_url = &fallback;

  webbase_url_reset(webbase_url, WEBBASE_URL_RESET_TOTAL);

  static_alloc(&sql, &sql_size, 512 + value_length);
  sprintf(sql, "select "WEBBASE_URL_FIELDS" from url where %s = %s%.*s%s", field, quote, value_length, value, quote);

  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);

  /* Nothing matches: release the result set and report it. */
  if(mysql_num_rows(result) == 0) {
    mysql_free_result(result);
    return 0;
  }

  record = mysql_fetch_row(result);
  /* The first selected column is used as the rowid. */
  if(record[0] == 0) {
    fprintf(stderr, "webbase_get_url_1: unexpected rowid field == NULL\n");
    exit(1);
  }
  rowid = atoi(record[0]);
#ifdef WEBBASE_LOCK
  webbase_lock(base, rowid, "url");
#endif /* WEBBASE_LOCK */

  webbase_url = webbase_decode_url(base, result, record, webbase_url);
  mysql_free_result(result);

  /*
   * Load additional information, if any
   */
  if(webbase_url->w_complete_rowid && (flag & WEBBASE_GET_URL_ALL)) {
    sprintf(sql, "select "WEBBASE_URL_COMPLETE_FIELDS" from url_complete where rowid = %d", webbase_url->w_complete_rowid);
    smysql_query(&base->mysql, sql);
    result = smysql_store_result(&base->mysql);
    if(mysql_num_rows(result) != 1) {
      fprintf(stderr, "webbase_get: url_complete with rowid = %d not found for %s\n", webbase_url->w_complete_rowid, webbase_url->w_url);
      exit(1);
    }
    record = mysql_fetch_row(result);
    webbase_url = webbase_decode_url(base, result, record, webbase_url);
    mysql_free_result(result);
  }

  return webbase_url;
}

/*
 * Look for an entry of the url table whose url_md5 column matches the
 * first MD5_ASCII_SIZE bytes of url_md5.
 *
 * Returns the rowid of the first row of the result, or 0 if the result
 * is empty.
 */
int webbase_exists_url(webbase_t* base, unsigned char* url_md5)
{
  static char* query = 0;
  static int query_size = 0;
  MYSQL_RES *res;
  int rowid;

  static_alloc(&query, &query_size, 128 + MD5_ASCII_SIZE);
  sprintf(query, "select rowid from url where url_md5 = '%.*s'", MD5_ASCII_SIZE, url_md5);

  smysql_query(&base->mysql, query);
  res = smysql_store_result(&base->mysql);

  if(mysql_num_rows(res) > 0) {
    MYSQL_ROW row = mysql_fetch_row(res);
    rowid = atoi(row[0]);
  } else {
    rowid = 0;
  }

  mysql_free_result(res);

  return rowid;
}

/*
 * Insert a default row for url in the url table: the quoted url, the
 * md5 of the url, info, code and tags set to 0 and crawl set to
 * 19700101000000.
 *
 * Returns the value reported by mysql_insert_id after the insertion.
 */
int webbase_visited(webbase_t* base, char* url)
{
  static char* query = 0;
  static int query_size = 0;
  unsigned char digest[MD5_ASCII_SIZE];
  int url_length = strlen(url);
  char* escaped_url = sql_quote_char_simple(url, url_length);
  int escaped_url_length = strlen(escaped_url);

  static_alloc(&query, &query_size, 256 + MD5_ASCII_SIZE + escaped_url_length);
  str2md5ascii(url, url_length, digest);

  sprintf(query,
          "insert into url (url, url_md5, info, code, tags, crawl) values ('%s', '%.*s', 0, 0, 0, 19700101000000)",
          escaped_url, MD5_ASCII_SIZE, digest);
  if(verbose > 5) {
    fprintf(stderr, "webbase_visited: %s\n", query);
  }
  smysql_query(&base->mysql, query);

  return mysql_insert_id(&base->mysql);
}

/*
 * Copy the fields of a result row into webbase_url.
 *
 * Each field of res is matched by name against the known columns of the
 * url and url_complete queries; fields whose value is NULL and fields
 * with an unknown name are skipped, leaving the corresponding member of
 * webbase_url untouched. Fixed size text members are truncated to their
 * maximum length and null terminated, variable size members are grown
 * with static_alloc as needed.
 *
 * If the row does not provide a full length url_md5, w_url_md5 is
 * computed from w_url after all fields are decoded: w_url is
 * dereferenced at that point and must therefore be non null, either
 * from this row or from an earlier call.
 *
 * The first argument is not used. Returns webbase_url.
 */
static webbase_url_t* webbase_decode_url(webbase_t* , MYSQL_RES *res, MYSQL_ROW row, webbase_url_t* webbase_url)
{
  unsigned int i;
  /*
   * Set if url_md5 field found and not empty.
   */
  int url_md5_set = 0;

  for(i = 0; i < mysql_num_fields(res); i++) {
    MYSQL_FIELD* field;
    if(row[i] == 0) continue;
    mysql_field_seek(res, i);
    field = mysql_fetch_field(res);
    if(!strcmp(field->name, "rowid")) {
      webbase_url->w_rowid = atoi(row[i]);
    } else if(!strcmp(field->name, "url")) {
      int length = strlen(row[i]);
      static_alloc(&webbase_url->w_url, &webbase_url->w_url_length, length + 1);
      strcpy(webbase_url->w_url, row[i]);
    } else if(!strcmp(field->name, "url_md5")) {
      /*
       * field->length describes the column, not this value: also require
       * a full length value so that memcpy never reads past the end of a
       * shorter (for instance empty) string. When the test fails the md5
       * is computed from the url after the loop.
       */
      if(field->length == MD5_ASCII_SIZE && strlen(row[i]) == MD5_ASCII_SIZE) {
        url_md5_set = 1;
        memcpy(webbase_url->w_url_md5, row[i], MD5_ASCII_SIZE);
      }
    } else if(!strcmp(field->name, "info+0")) {
      webbase_url->w_info = atoi(row[i]);
    } else if(!strcmp(field->name, "code")) {
      webbase_url->w_code = atoi(row[i]);
    } else if(!strcmp(field->name, "unix_timestamp(mtime)")) {
      webbase_url->w_mtime = atol(row[i]);
    } else if(!strcmp(field->name, "unix_timestamp(mtime_error)")) {
      webbase_url->w_mtime_error = atol(row[i]);
    } else if(!strcmp(field->name, "tags")) {
      webbase_url->w_tags = atoi(row[i]);
    } else if(!strcmp(field->name, "content_type")) {
      strncpy(webbase_url->w_content_type, row[i], WEBBASE_CONTENT_TYPE_LENGTH);
      webbase_url->w_content_type[WEBBASE_CONTENT_TYPE_LENGTH] = '\0';
    } else if(!strcmp(field->name, "content_length")) {
      webbase_url->w_content_length = atoi(row[i]);
    } else if(!strcmp(field->name, "md5")) {
      /*
       * The value may be shorter than MD5_ASCII_SIZE (for instance
       * empty): strncpy stops at its end and pads the remainder with
       * null bytes instead of reading beyond the string. A full length
       * value is copied exactly as memcpy would.
       */
      strncpy((char*)webbase_url->w_md5, row[i], MD5_ASCII_SIZE);
    } else if(!strcmp(field->name, "complete_rowid")) {
      webbase_url->w_complete_rowid = atoi(row[i]);
    } else if(!strcmp(field->name, "unix_timestamp(crawl)")) {
      /*
       * Fix negative date : 1970010100 may be negative if the timezone
       * subtracts too much. The leading minus sign is dropped.
       */
      webbase_url->w_crawl = atol(row[i][0] == '-' ? row[i] + 1 : row[i]);
    } else if(!strcmp(field->name, "hookid")) {
      webbase_url->w_hookid = atoi(row[i]);
    } else if(!strcmp(field->name, "language")) {
      strncpy(webbase_url->w_language, row[i], WEBBASE_LANGUAGE_LENGTH);
      webbase_url->w_language[WEBBASE_LANGUAGE_LENGTH] = '\0';
    } else if(!strcmp(field->name, "extract")) {
      strncpy(webbase_url->w_extract, row[i], WEBBASE_EXTRACT_LENGTH);
      webbase_url->w_extract[WEBBASE_EXTRACT_LENGTH] = '\0';
    } else if(!strcmp(field->name, "title")) {
      strncpy(webbase_url->w_title, row[i], WEBBASE_TITLE_LENGTH);
      webbase_url->w_title[WEBBASE_TITLE_LENGTH] = '\0';
    } else if(!strcmp(field->name, "keywords")) {
      strncpy(webbase_url->w_keywords, row[i], WEBBASE_KEYWORDS_LENGTH);
      webbase_url->w_keywords[WEBBASE_KEYWORDS_LENGTH] = '\0';
    } else if(!strcmp(field->name, "description")) {
      strncpy(webbase_url->w_description, row[i], WEBBASE_DESCRIPTION_LENGTH);
      webbase_url->w_description[WEBBASE_DESCRIPTION_LENGTH] = '\0';
    } else if(!strcmp(field->name, "base_url")) {
      int length = strlen(row[i]);
      static_alloc(&webbase_url->w_base_url, &webbase_url->w_base_url_length, length + 1);
      strcpy(webbase_url->w_base_url, row[i]);
    } else if(!strcmp(field->name, "relative")) {
      int length = strlen(row[i]);
      static_alloc(&webbase_url->w_relative, &webbase_url->w_relative_length, length + 1);
      strcpy(webbase_url->w_relative, row[i]);
    } else if(!strcmp(field->name, "absolute")) {
      int length = strlen(row[i]);
      static_alloc(&webbase_url->w_absolute, &webbase_url->w_absolute_length, length + 1);
      strcpy(webbase_url->w_absolute, row[i]);
    } else if(!strcmp(field->name, "location")) {
      int length = strlen(row[i]);
      static_alloc(&webbase_url->w_location, &webbase_url->w_location_length, length + 1);
      strcpy(webbase_url->w_location, row[i]);
    }
  }

  /*
   * No usable url_md5 in the row: derive it from the url.
   */
  if(!url_md5_set)
    str2md5ascii(webbase_url->w_url, strlen(webbase_url->w_url), webbase_url->w_url_md5);

  if(verbose > 1) fprintf(stderr, "decode_url: %s\n", webbase_url->w_url);
  return webbase_url;
}

/*
 * start2url functions
 */

/*
 * Record (start_rowid, url_rowid, level) in the start2url table with a
 * "replace into" statement.
 *
 * Returns the value reported by mysql_insert_id after the statement.
 */
int webbase_insert_start2url(webbase_t* base, int start_rowid, int url_rowid, int level)
{
  char statement[128];

  sprintf(statement, "replace into start2url values (%d, %d, %d)", start_rowid, url_rowid, level);
  if(verbose > 5) {
    fprintf(stderr, "webbase_start2url: %s\n", statement);
  }
  smysql_query(&base->mysql, statement);

  return mysql_insert_id(&base->mysql);
}

/*
 * Look up an entry of the start2url table.
 *
 * The search key is taken from the values pointed to on entry: start
 * and url when both *start_rowid and *url_rowid are non zero, url alone
 * when only *url_rowid is non zero. A lookup with *url_rowid equal to 0
 * is not supported: no query is run and 0 is returned.
 *
 * When a row is found, *start_rowid, *url_rowid and *level are
 * overwritten with the non NULL start, url and level fields of the
 * first row of the result.
 *
 * Returns 1 if a row was found, 0 otherwise.
 */
int webbase_get_start2url(webbase_t* base, int* start_rowid, int* url_rowid, int* level)
{
  MYSQL_RES *res;
  MYSQL_ROW row;
  char query[128];
  unsigned int i;
  int ret = 0;

  /*
   * Without a url rowid there is nothing to build a query from: bail
   * out instead of sending an uninitialized buffer to the server.
   */
  if(!*url_rowid)
    return 0;

  if(*start_rowid)
    snprintf(query, sizeof(query), "select * from start2url where start = %d and url = %d", *start_rowid, *url_rowid);
  else
    snprintf(query, sizeof(query), "select * from start2url where url = %d", *url_rowid);

  smysql_query(&base->mysql, query);
  res = smysql_store_result(&base->mysql);

  if(mysql_num_rows(res)) {
    row = mysql_fetch_row(res);

    for(i = 0; i < mysql_num_fields(res); i++) {
      MYSQL_FIELD* field;
      if(row[i] == 0) continue;
      mysql_field_seek(res, i);
      field = mysql_fetch_field(res);
      if(!strcmp(field->name, "start")) {
        *start_rowid = atoi(row[i]);
      } else if(!strcmp(field->name, "url")) {
        *url_rowid = atoi(row[i]);
      } else if(!strcmp(field->name, "level")) {
        *level = atoi(row[i]);
      }
    }
    ret = 1;
  }

  mysql_free_result(res);

  return ret;
}

/*
 * Tell if the URL rowid is registered in start2url for the starting
 * point start (matched on start->rowid).
 *
 * Returns the number of matching rows, or 0 if the level of the first
 * matching row is negative.
 */
int webbase_counted(webbase_t* base, webbase_url_start_t* start, int rowid)
{
  char query[128];
  MYSQL_RES *res;
  int counted;

  sprintf(query, "select level from start2url where start = %d and url = %d", start->rowid, rowid);
  smysql_query(&base->mysql, query);
  res = smysql_store_result(&base->mysql);
  counted = mysql_num_rows(res);

  if(counted) {
    /*
     * A negative level marks an entry that is not valid: treat it as
     * if it did not exist. It is used, for instance, when trying to
     * find out which URLs of a starting point are out of scope.
     */
    MYSQL_ROW row = mysql_fetch_row(res);
    if(atoi(row[0]) < 0) {
      counted = 0;
    }
  }
  mysql_free_result(res);

  if(verbose) {
    fprintf(stderr, "\twebbase_counted: %s -> %d\n", query, counted);
  }

  return counted;
}

/*
 * Node release function installed on the options hash by
 * webbase_alloc_1: free the data attached to the node, then the node
 * itself. The second argument is not used.
 */
static void hnode_free(hnode_t *node, void *)
{
  void* payload = node->data;

  free(payload);
  free(node);
}

/*
 * Allocate a webbase_t with smalloc, clear it and attach a freshly
 * created options hash whose nodes are released with hnode_free.
 *
 * Returns the new object.
 */
static webbase_t* webbase_alloc_1()
{
  webbase_t* fresh = (webbase_t*)smalloc(sizeof(*fresh));

  memset(fresh, 0, sizeof(*fresh));

  fresh->options = hash_create(33, 0, 0);
  hash_set_allocator(fresh->options, 0, hnode_free, 0);

  return fresh;
}
