/*
 * Rewritten as Scout by Jeffrey Fulmer
 *  
 * LinkCheck 
 * Copyright (C) 2000 Inter7 Internet Technologies, Inc.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>
#include <ctype.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <arpa/nameser.h>
#include <netdb.h>
#include <resolv.h>

#include <joedog/joedog.h>
#include <joedog/joepath.h>

#include "setup.h"
#include "getopt.h"

/* HTML parsing tokens: delimiter sets handed to strtok() by parse_control() */
#define CONTROL_TOKENS " ="
/* same set plus the double quote; used when the next token is a value */
#define CONTROL_TOKENS_PLUS " =\""

/*
 * url types: values stored in url_link.url_type.  get_site() switches on
 * them to pick a checker; write_to_file() uses them to pick what to emit.
 * LOCAL_HTML is 0, so a zeroed url_link starts out as LOCAL_HTML.
 */
#define LOCAL_HTML          0
#define LOCAL_PIC           1
#define LOCAL_SUBDIR        2
#define REMOTE_URL          3
#define MAILTO_URL          4
#define FTP_URL             5
#define FILE_URL            6
#define NEWS_URL            7
#define GOPHER_URL          8
#define TELNET_URL          9
#define WAIS_URL           10
#define HTTPS_URL          11
#define POWWOW_URL         12

/* buffer sizes, in bytes, of the fixed character arrays used in this file */
#define MAX_SMALL_BUF     150
#define MAX_MEDIUM_BUF  10000
#define MAX_HUGE_BUF   400000

/*
 * parent link structure: a singly linked list node holding one path.
 * NOTE(review): nothing in this file references url_parent -- confirm it
 * is unused before removing it.
 */
typedef struct url_parent {
  char    full_path[100];
  struct  url_parent *next;
} url_parent;

/*
 * One entry in a url_link's list of children.  add_child_url() allocates
 * these and appends them to the parent's list.
 */
typedef struct url_child {
  void   *child;            /* add_child_url() stores a url_link * here */
  struct  url_child *next;  /* next child entry, NULL at the end of the list */
} url_child;

/*
 * link structure: one node per distinct (site, full_path) pair discovered
 * while crawling.  Nodes are created and de-duplicated by add_url_link().
 */
typedef struct url_link {
  char   url[MAX_SMALL_BUF];           /* the url text as handed to add_url_link() */
  char   site[MAX_SMALL_BUF];          /* host part, filled in by the setup_*() helpers */
  char   full_path[MAX_SMALL_BUF];     /* path on that host, normalised by full_out_url() */
  char   directory[MAX_SMALL_BUF];     /* full_path up to and including its last '/' */
  char   server[MAX_SMALL_BUF];        /* value of the "Server: " header (set_server) */
  char   modified_date[MAX_SMALL_BUF]; /* value of the last-modified header (set_page_date) */
  int    url_type;                     /* one of the url type constants (LOCAL_HTML ...) */
  int    dont_follow;                  /* set to 1 by parse_control() for images/backgrounds */
  int    used;                         /* only copied by copy_full_url() in this file */
  int    count;                        /* incremented on every add_url_link() hit */
  int    checked;                      /* only copied by copy_full_url() in this file */
  int    paged;                        /* set to 1 by get_site() once the link was processed */
  int    return_code;                  /* HTTP status, or one of the HTTP_* pseudo codes */
  int    size;                         /* number of bytes read by http_read_url() */
  struct url_link  *next;              /* next node in the global list */
  struct url_link  *parent;            /* link passed as parent when this node was created */
  struct url_child *child;             /* list built by add_child_url() */
} url_link;
/* head of the global list of all known links; NULL until the first add_url_link() */
struct url_link *URLS;

/*
 * Long command line options accepted by parse_cmdline().  Each entry maps
 * a long name to the short option character getopt_long() returns for it.
 * The final all-zero element is mandatory: getopt_long() uses it to find
 * the end of the array.
 */
static struct option long_options[] =
{
  { "version",    no_argument,       NULL, 'V' },
  { "help",       no_argument,       NULL, 'h' },
  { "verbose",    no_argument,       NULL, 'v' },
  { "file",       required_argument, NULL, 'f' },
  { "pictures",   no_argument,       NULL, 'p' },
  { "new",        no_argument,       NULL, 'n' },
  { NULL,         0,                 NULL, 0   }
};

/* globals */
struct url_link TmpUrl;            /* scratch link filled by add_url_link() before lookup */
char   TmpBuf[MAX_HUGE_BUF];       /* request text, then the response read by http_read_url() */
char   HBuf[MAX_MEDIUM_BUF];       /* text of the tag/script currently examined by parse_for_hrefs() */
char   CBuf[MAX_MEDIUM_BUF];       /* scratch for url tokens and for the mail domain in check_mailto() */
char   TheSite[MAX_SMALL_BUF];     /* host name of the starting site (set_starting) */
char   TheSiteAddr[MAX_SMALL_BUF]; /* dotted address of TheSite, filled on first connect */
char   CurDir[MAX_SMALL_BUF];      /* directory of the page being parsed; prefix for relative links */
char   TheSubDir[MAX_SMALL_BUF];   /* full_path of the first link seen by clean_up_url() */
char   filename[MAX_SMALL_BUF];    /* output file given with -f */
int    filebool;                   /* non-zero when -f was given */
int    newbool;                    /* non-zero when -n was given: overwrite instead of append */
int    Virgin;                     /* set to 1 by init_all(); not read in this file */
int    sock;                       /* socket descriptor of the current connection */
int    port;                       /* TCP port used by connect_to_server() */
int    verbose;                    /* non-zero: print a line per link processed */
int    pictures;                   /* non-zero when -p was given: fetch LOCAL_PIC links */
int    CheckRemote;                /* 0: check_site() marks links OK without fetching */
int    CGIRun;                     /* set to 0 by init_all(); not read in this file */
int    MaxUrls;                    /* 0 = unlimited; otherwise get_site() stops at this count */

/* answer buffer for the MX lookup done by check_mailto() */
static union { HEADER hdr; unsigned char buf[PACKETSZ]; } response;
/* per the comment in display_version(), defined in version.c */
extern char *version_string;

/* prototypes */
struct url_link *add_url_link();
char *http_error_string();
char *comma_string_int();
char *time_string_int();
char *strstart();
char *safe_getenv();
void display_version( int i );
void display_help();
void parse_cmdline( int, char *[] );

/*
 * Program entry point: initialise the globals, read the command line,
 * crawl the starting site and write the collected urls to the output file.
 *
 * All console output goes through stdio.  Mixing raw write(2) calls with
 * buffered printf() output put the pieces in the wrong order whenever
 * stdout was not a terminal.
 */
int
main( int argc, char *argv[] )
{
  /* init things */
  init_all();

  /* get the command line options */
  parse_cmdline( argc, argv );

  /* display information */
  fputs( "** ", stdout );
  display_version( FALSE );
  printf( "** Scouting %s for a siege.\n", TheSite );

  /* get the site and walk all the links */
  get_site();

  /* print out the report */
  write_to_file();
  fputs( "done.\n", stdout );

  exit( EXIT_SUCCESS );
}

/*
 * Connect to a web server
 * Input:        server name (host name or dotted-quad address)
 * Returns:      0  on successful connection
 *               -1 on failure; the socket is closed again in that case
 * Globals Read: port (TCP port), TheSite
 * Globals Set:  sock is set to the socket descriptor;
 *               TheSiteAddr receives the dotted address the first time
 *               the starting site itself is resolved
 */
int
connect_to_server(char *site)
{
  struct hostent *hostEntry;
  struct sockaddr_in internetAddr;

  if ( site == NULL ) {
    return(-1);
  }
  if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
    return(-1);
  }
  memset(&internetAddr, 0, sizeof(internetAddr));
  internetAddr.sin_port   = htons(port);
  internetAddr.sin_family = AF_INET;

  hostEntry = gethostbyname(site);
  if ( hostEntry != NULL && hostEntry->h_addrtype == AF_INET &&
       hostEntry->h_addr_list[0] != NULL &&
       (size_t)hostEntry->h_length <= sizeof(internetAddr.sin_addr) ) {
    memcpy(&internetAddr.sin_addr, hostEntry->h_addr_list[0], hostEntry->h_length);
  }
  else {
    /* not resolvable as a name: try it as a dotted-quad address.
     * inet_addr() returns an unsigned value, so failure must be tested
     * against INADDR_NONE; "< 0" can never be true. */
    internetAddr.sin_addr.s_addr = inet_addr(site);
    if ( internetAddr.sin_addr.s_addr == INADDR_NONE ) {
      close(sock);
      return(-1);
    }
  }
  if ( TheSiteAddr[0] == 0 && strcmp(site, TheSite) == 0 ) {
    snprintf( TheSiteAddr, sizeof(TheSiteAddr), "%s", inet_ntoa(internetAddr.sin_addr) );
  }

  /**
   * connect to remote host
   */
  if ( connect(sock, (struct sockaddr *)&internetAddr, sizeof(internetAddr)) < 0 ) {
    close(sock);
    return(-1);
  }
  return 0;
}

/*
 * Scan an HTTP response for tags and hand each one to parse_control().
 *
 * html_text  complete response text (headers and body); it is modified in
 *            place by html_lower()
 * urll       the link the response belongs to; its return_code is set
 *            from the response first
 *
 * Only responses with HTTP_OK or HTTP_MOVED_TEMPORARILY are parsed.
 * HTML comments are skipped, the content of a script element is passed to
 * parse_control() as one chunk, and every other tag is passed without its
 * angle brackets.  Chunks longer than HBuf are cut off.
 *
 * Returns 1 when the text was parsed, 0 when it was skipped because of
 * its return code.
 */
int
parse_for_hrefs(char *html_text, struct url_link *urll)
{
  char *cursor;
  int   n;

  set_return_code( urll, html_text);

  if ( urll->return_code != HTTP_OK &&
       urll->return_code != HTTP_MOVED_TEMPORARILY ) {
    write( 1, "RETURNED!!\n", 11 );
    return 0;
  }

  html_lower(html_text);

  memset(HBuf, 0, MAX_MEDIUM_BUF);
  cursor = html_text;
  while ( *cursor != '\0' ) {
    if ( *cursor != '<' ) {
      cursor++;
      continue;
    }
    cursor++;                      /* step over the '<' */

    /* skip comments */
    if ( strstart(cursor, "!--") == cursor ) {
      cursor++;
      while ( *cursor != '\0' && strstart(cursor, "-->") != cursor ) {
        cursor++;
      }
      if ( *cursor != '\0' ) {
        cursor += 3;
      }
      /* resume exactly after "-->": a tag that follows immediately must
       * not be skipped */
      continue;
    }

    if ( strstart(cursor, "script") == cursor ) {
      n = 0;
      cursor++;
      while ( *cursor != '\0' && n < (MAX_MEDIUM_BUF-1) ) {
        if ( strstart(cursor, "</script>") == cursor ) {
          cursor += 9;
          break;
        }
        HBuf[n++] = *cursor++;
      }
      HBuf[n] = 0;
      parse_control(HBuf, urll);
      /* resume exactly after "</script>" for the same reason */
      continue;
    }

    /* this works with standard html */
    n = 0;
    while ( *cursor != '\0' && *cursor != '>' && n < (MAX_MEDIUM_BUF-1) ) {
      HBuf[n++] = *cursor++;
    }
    HBuf[n] = 0;
    parse_control(HBuf, urll);
    if ( *cursor == '>' ) {
      cursor++;
    }
  }
  return 1;
}

/*
 * Register a url in the global URLS list.
 *
 * url          url text; it is cut at its first '#' in place
 * parent_urll  link on whose page the url was found, or NULL
 *
 * The url is expanded into TmpUrl by full_out_url().  When a node with the
 * same full_path and site already exists that node is reused, otherwise a
 * new node is appended and gets parent_urll as parent (when not NULL).
 * In both cases the node's hit count is incremented.
 *
 * Returns the node for the url.
 */
struct url_link *
add_url_link(char *url, url_link *parent_urll)
{
  struct url_link  *node;
  struct url_link **slot;
  char             *hash;
  int               list_was_empty;

  /* zap off the ends of things with #'s in them */
  hash = strchr(url, '#');
  if ( hash != NULL ) {
    *hash = 0;
  }

  /* break out the data */
  memset(&TmpUrl, 0, sizeof(struct url_link));
  strncpy( TmpUrl.url, url, MAX_SMALL_BUF-1);
  full_out_url(&TmpUrl);

  /* walk the list; when no node matches, slot ends up at the NULL link
   * where a new node has to be attached */
  list_was_empty = ( URLS == NULL );
  node = NULL;
  for ( slot = &URLS; *slot != NULL; slot = &(*slot)->next ) {
    if ( strncmp((*slot)->full_path, TmpUrl.full_path, MAX_SMALL_BUF) == 0 &&
         strncmp((*slot)->site, TmpUrl.site, MAX_SMALL_BUF) == 0 ) {
      node = *slot;
      break;
    }
  }

  /* add a new slot */
  if ( node == NULL ) {
    *slot = (struct url_link *)malloc(sizeof(url_link));
    if ( *slot == NULL ) {
      if ( list_was_empty ) {
        joe_fatal("URLS");
      }
      else {
        joe_fatal( "urll->next" );
      }
    }
    node = *slot;
    memset( node, 0, sizeof(url_link) );
    copy_full_url(node, &TmpUrl);
    if ( parent_urll != NULL ) {
      node->parent = parent_urll;
    }
  }

  /* add the hit */
  node->count++;

  return( node );
}

/*
 * Write the collected urls to the output file, one per line.
 *
 * The file is the one named with -f (filename) or CNF_FILE otherwise; it
 * is overwritten when -n was given (newbool) and appended to otherwise.
 * Links of type LOCAL_HTML, LOCAL_PIC, LOCAL_SUBDIR, FTP_URL and
 * REMOTE_URL are written as TheSite followed by their full_path.
 * NOTE(review): remote and ftp links are also written with TheSite as
 * host rather than their own site -- confirm that this is intended.
 *
 * Returns 0 on success, -1 when the file could not be opened or written.
 */
int
write_to_file(void)
{
  const char      *mode;
  const char      *path;
  struct url_link *urll;
  FILE            *fs;

  mode = ( newbool > 0 ) ? "w" : "a";
  path = filebool ? filename : CNF_FILE;

  fs = fopen( path, mode );
  if ( fs == NULL ) {
    /* without this check every fprintf() below would get a NULL stream */
    fprintf( stderr, "scout: unable to open %s: %s\n", path, strerror(errno) );
    return -1;
  }

  for ( urll = URLS; urll != NULL; urll = urll->next ) {
    switch ( urll->url_type ) {
      case LOCAL_HTML:
      case LOCAL_PIC:
      case LOCAL_SUBDIR:
      case FTP_URL:
      case REMOTE_URL:
        fprintf( fs, "%s%s\n", TheSite, urll->full_path );
        break;
      default:
        break;
    }
  }

  if ( fclose(fs) != 0 ) {
    fprintf( stderr, "scout: error writing %s: %s\n", path, strerror(errno) );
    return -1;
  }
  return 0;
}

/*
 * Fetch a page and register every link found on it.
 *
 * CurDir is set to the page's directory first, because full_out_url()
 * prefixes relative links with it.  On a successful read the server and
 * modification date headers are recorded and the text in TmpBuf is parsed.
 *
 * Returns 1 when the page was read and parsed, 0 when the read failed.
 */
int
get_page_links(struct url_link *urll)
{
  if ( verbose == 1 ) {
    printf("getting page %-30s %s\n", urll->site, urll->full_path);
  }

  memset(CurDir, 0, MAX_SMALL_BUF);
  strncpy( CurDir, urll->directory, MAX_SMALL_BUF-1);

  if ( http_read_url(urll, 0) <= 0 ) {
    return 0;
  }
  set_server(urll);
  set_page_date(urll);
  parse_for_hrefs(TmpBuf, urll);
  return 1;
}

/*
 * Walk the global URLS list until every link has been processed.
 *
 * Processing a page appends newly found links to the list, so the list is
 * rescanned until a complete pass finds no link with paged == 0.  Each
 * link is dispatched to the checker for its url_type.  When MaxUrls is
 * non-zero and that many links have been processed, the urls collected so
 * far are written out and the program exits with EXIT_FAILURE.
 *
 * Returns 0.
 */
int
get_site(void)
{
  url_link *urll;
  int pending = TRUE;
  int count = 0;

  while ( pending ) {
    pending = FALSE;
    for ( urll = URLS; urll != NULL; urll = urll->next ) {
      if ( urll->paged != 0 ) {
        continue;
      }
      count++;
      if ( MaxUrls != 0 && count >= MaxUrls ) {
        write_to_file();
        exit( EXIT_FAILURE );
      }
      pending = TRUE;
      urll->paged = 1;
      switch( urll->url_type ) {
        case LOCAL_HTML:
          get_page_links(urll);
          break;
        case LOCAL_PIC:
          if( pictures )
            check_pic(urll);
          break;
        case HTTPS_URL:
        case LOCAL_SUBDIR:
        case REMOTE_URL:
          check_site(urll);
          break;
        case MAILTO_URL:
          check_mailto(urll);
          break;
        case FTP_URL:
          check_ftp(urll);
          break;
        case FILE_URL:
          check_file(urll);
          break;
        case NEWS_URL:
          check_news(urll);
          break;
        case GOPHER_URL:
          check_gopher(urll);
          break;
        case TELNET_URL:
          check_telnet(urll);
          break;
        case WAIS_URL:
          check_wais(urll);
          break;
        case POWWOW_URL:
          check_powwow(urll);
          break;
        default:
          /* unknown type: nothing to check */
          break;
      }
    }
  }
  return 0;
}

/*
 * Print the program name and version on stdout.
 *
 * version_string lives in version.c; keeping it in its own file allows
 * configure to parse it.
 *
 * i  when equal to 1 the program exits with EXIT_SUCCESS after printing,
 *    any other value returns to the caller
 */
void
display_version( int i )
{
#ifdef DEBUG
  printf( "Scout %s: debugging enabled\n", version_string );
#else
  printf( "Scout %s\n", version_string );
#endif

  if ( i != 1 ) {
    return;
  }
  exit( EXIT_SUCCESS );
}  /* end of display_version */

/*
 * Print the version line followed by the usage text, then exit with
 * EXIT_SUCCESS.  This function does not return.
 */
void
display_help()
{
  /* one entry per output line; puts() supplies the newline */
  static const char *usage[] = {
    "Usage: scout [options]",
    "Options:",
    "  -V, --version         VERSION, prints version number to screen.",
    "  -h, --help            HELP, prints this section.",
    "  -v, --verbose         VERBOSE, prints notification to screen.",
    "  -f, --file=FILE       FILE, change the configuration file to file.",
    "  -n, --new             NEW create a new (overwrite) URLs file.",
    "  -p, --pictures        PICS, include images in the output file."
  };
  size_t line;

  /* version banner first, without exiting */
  display_version( FALSE );

  for ( line = 0; line < sizeof(usage) / sizeof(usage[0]); line++ ) {
    puts( usage[line] );
  }

  /* our work is done, exit nicely */
  exit( EXIT_SUCCESS );
}

/*
 * Read the command line options and set the matching globals, then hand
 * the last argument to set_starting() as the site to crawl.
 *
 * -V and -h print and exit.  An unknown option ends the program with
 * EXIT_FAILURE (getopt_long() has already reported it).  When no
 * non-option argument is present the usage text is shown instead of
 * treating an option, or the program name, as the site.
 */
void
parse_cmdline( int argc, char *argv[] )
{
  int c = 0;

  while ((c = getopt_long (argc, argv, "Vhvnpf:", long_options, (int *)0)) != EOF){
    switch( c ){
      case 'V':
        display_version( TRUE );
        break;
      case 'h':
        display_help();
        exit( EXIT_SUCCESS );
      case 'v':
        verbose = TRUE;
        break;
      case 'f':
        /* strncpy() does not terminate the copy when optarg fills the buffer */
        strncpy( filename, optarg, MAX_SMALL_BUF-1 );
        filename[MAX_SMALL_BUF-1] = '\0';
        filebool = 1;
        break;
      case 'n':
        newbool  = 1;
        break;
      case 'p':
        pictures = 1;
        break;
      default:
        exit( EXIT_FAILURE );
    } /** end of switch **/
  }   /** end of while  **/

  /* optind is the index of the first argument that is not an option */
  if ( optind >= argc ) {
    display_help();
  }
  set_starting(argv[argc-1]);
}

/*
 * Check a link that is not crawled itself (remote, https or a local page
 * outside the starting directory).
 *
 * With CheckRemote == 0 the link is simply marked HTTP_OK.  Otherwise the
 * first part of the response is read (http_read_url with not_all = 1) and
 * return code, server and date are recorded.  A temporary redirect is
 * followed up with get_page_links().
 *
 * Returns 0.
 */
int
check_site( url_link *urll )
{
  if ( verbose == 1 ) {
    printf("getting site %-30s %s\n", urll->site, urll->full_path);
  }

  if ( CheckRemote == 0 ) {
    urll->return_code = HTTP_OK;
    return 0;
  }

  if ( http_read_url(urll, 1) > 0 ) {
    set_return_code( urll, TmpBuf);
    set_server(urll);
    set_page_date(urll);
    if ( urll->return_code == HTTP_MOVED_TEMPORARILY ) {
      get_page_links(urll);
    }
  }
  return 0;
}

/*
 * Normalise the inside of HTML tags in place so that tag and attribute
 * names can be compared against lower case keywords.
 *
 * Between '<' and '>' every upper case letter outside double quotes is
 * converted to lower case and every newline outside double quotes becomes
 * a space.  Quoted attribute values and all text outside of tags are left
 * untouched.
 *
 * Returns 0.
 */
int
html_lower(char *html_text)
{
  char *p;
  int in_angle  = 0;
  int in_quotes = 0;

  if ( html_text == NULL ) {
    return 0;
  }

  for ( p = html_text; *p != 0; p++ ) {
    if ( in_angle == 0 ) {
      if ( *p == '<' ) {
        in_angle = 1;
      }
    } else if ( in_quotes == 1 ) {
      if ( *p == '"' ) {
        in_quotes = 0;
      }
    } else if ( *p == '>' ) {
      in_angle = 0;
    } else if ( *p == '"' ) {
      in_quotes = 1;
    } else if ( isupper((unsigned char)*p) ) {
      /* the ctype functions need an unsigned char value: a negative
       * plain char is undefined behaviour */
      *p = (char)tolower((unsigned char)*p);
    } else if ( *p == '\n' ) {
      *p = ' ';
    }
  }
  return 0;
}

/*
 * Copy one url token into CBuf and register it with add_url_link().
 * Returns the link for the url.
 */
static struct url_link *
control_add_link(char *token, struct url_link *parent_urll)
{
  memset( CBuf, 0, MAX_MEDIUM_BUF);
  strncpy( CBuf, token, MAX_MEDIUM_BUF-1);
  return add_url_link(CBuf, parent_urll);
}

/*
 * Register a url that is embedded in the page (frame, image, background):
 * force its type and record it as a child of the page.  dont_follow is
 * only set on the link when the argument is non-zero.
 */
static struct url_link *
control_add_embedded(char *token, struct url_link *parent_urll,
                     int url_type, int dont_follow)
{
  struct url_link *urll = control_add_link(token, parent_urll);

  if ( dont_follow ) {
    urll->dont_follow = 1;
  }
  urll->url_type = url_type;
  add_child_url(urll, parent_urll);
  return urll;
}

/*
 * Look through the text of one tag (or one script body) for urls.
 *
 * input_buf    tag text without angle brackets; it is split in place
 *              with strtok()
 * parent_urll  link of the page the text came from
 *
 * Recognised are href and location.href values, the src of frame and img
 * tags and background values.  Each url found is registered; the last one
 * registered for a keyword is passed to clean_up_url().
 *
 * Returns 0.
 */
int
parse_control(char *input_buf, struct url_link *parent_urll)
{
  char *tok;
  struct url_link *urll;

  tok = strtok(input_buf, CONTROL_TOKENS);
  while ( tok != NULL ) {
    urll = NULL;

    if ( strncmp(tok, "href", 4) == 0 ||
         strncmp(tok, "location.href", 13) == 0 ) {
      tok = strtok(NULL, CONTROL_TOKENS_PLUS);
      if ( tok != NULL ) {
        urll = control_add_link(tok, parent_urll);
      }
    }
    else if ( strncmp(tok, "frame", 5) == 0 ) {
      /* a frame tag is scanned to its end for src attributes */
      tok = strtok(NULL, CONTROL_TOKENS);
      while ( tok != NULL ) {
        if ( strncmp(tok, "src", 3) == 0 ) {
          tok = strtok(NULL, CONTROL_TOKENS_PLUS);
          if ( tok != NULL ) {
            urll = control_add_embedded(tok, parent_urll, LOCAL_HTML, 0);
          }
        }
        tok = strtok(NULL, CONTROL_TOKENS);
      }
    }
    else if ( strncmp(tok, "img", 3) == 0 ) {
      /* only a src that directly follows the tag name is picked up */
      tok = strtok(NULL, CONTROL_TOKENS);
      if ( tok != NULL && strncmp(tok, "src", 3) == 0 ) {
        tok = strtok(NULL, CONTROL_TOKENS_PLUS);
        if ( tok != NULL ) {
          urll = control_add_embedded(tok, parent_urll, LOCAL_PIC, 1);
        }
      }
    }
    else if ( strncmp(tok, "background", 10) == 0 ) {
      tok = strtok(NULL, CONTROL_TOKENS_PLUS);
      if ( tok != NULL ) {
        urll = control_add_embedded(tok, parent_urll, LOCAL_PIC, 1);
      }
    }

    if ( urll != NULL ) {
      clean_up_url(urll);
    }
    tok = strtok(NULL, CONTROL_TOKENS);
  }
  return 0;
}

/*
 * Fetch an image link and record its return code and modification date.
 * Only called by get_site() when the -p option was given.
 *
 * Returns 0.
 */
int
check_pic(url_link *urll)
{
  if ( verbose == 1 ) {
    printf("getting pic  %-30s %s\n", urll->site, urll->full_path);
  }
  if ( http_read_url(urll, 0) > 0 ) {
    set_return_code( urll, TmpBuf);
    set_page_date(urll);
  }
  return 0;
}

full_out_url(urll)
 struct url_link *urll;
{
 char *tmpstr;
 int   i,j,k;
 int   count;
 int   len;
 int   found_dot;
 int   last_slash;


	process_dir_slash(urll->url, MAX_SMALL_BUF);
	process_index_tag(urll->url, MAX_SMALL_BUF);

	/* full path */
	if ( urll->url[0] == '/' ) {
		setup_standard(urll);

	/* http url */
	} else if ( strstr( urll->url, "http://" ) != NULL ) {
		setup_double_slash(urll);
		if ( strncmp( TheSite, urll->site, MAX_SMALL_BUF ) == 0 || 
			 strncmp( TheSiteAddr, urll->site, MAX_SMALL_BUF) == 0 ) {
			urll->url_type = LOCAL_HTML;
		} else {
			urll->url_type = REMOTE_URL;
		}

	} else if ( strstr( urll->url, "https://" ) != NULL ) {
		setup_double_slash(urll);
		urll->url_type = HTTPS_URL;

	} else if ( strstr( urll->url, "ftp://" ) != NULL ) {
		setup_double_slash(urll);
		urll->url_type = FTP_URL;

	} else if ( strstart( urll->url, "mailto:" ) == urll->url ) {
		setup_standard(urll);
		urll->url_type = MAILTO_URL;
		return;

	} else if ( strstart( urll->url, "file://" ) == urll->url ) {
		setup_double_slash(urll);
		urll->url_type = FILE_URL;
		return;

	} else if ( strstart( urll->url, "gopher://" ) == urll->url ) {
		setup_double_slash(urll);
		urll->url_type = GOPHER_URL;
		return;

	} else if ( strstart( urll->url, "telnet://" ) == urll->url ) {
		setup_double_slash(urll);
		urll->url_type = TELNET_URL;
		return;

	} else if ( strstart( urll->url, "wais://" ) == urll->url ) {
		setup_double_slash(urll);
		urll->url_type = WAIS_URL;
		return;

	} else if ( strstart( urll->url, "news:" ) == urll->url ) {
		setup_standard(urll);
		urll->url_type = NEWS_URL;
		return;

	} else if ( strstart( urll->url, "powwow:" ) == urll->url ) {
		setup_standard(urll);
		urll->url_type = POWWOW_URL;
		return;

	} else if ( strstr( urll->url, "http:" ) != NULL ) {
		setup_single_colon(urll);
		urll->url_type = LOCAL_HTML;

	/* relative path */
	} else {
		sprintf(urll->full_path, "%s%s", CurDir, urll->url);
		memset(urll->site, 0, MAX_SMALL_BUF);
		strncpy( urll->site, TheSite, MAX_SMALL_BUF-1);
	}

	/* Find the directory
	 * first we count the number of slashes, more than one means
	 * that we are in a different directory 
	 */
	len = strlen(urll->full_path);
	for(count=0;count<len;++count){
		if ( urll->full_path[count] == '/' ) { 
			last_slash = count;
		}
	}

	if ( last_slash == 0 ) {
		memset(urll->directory, 0, MAX_SMALL_BUF);
		strncpy( urll->directory, "/", MAX_SMALL_BUF-1);
	} else {
		for(i=0;i<=last_slash;++i){
			urll->directory[i] = urll->full_path[i];
		}
		urll->directory[i] = 0; 
	}

	process_up_dir(urll->full_path, MAX_SMALL_BUF);
	process_up_dir(urll->directory, MAX_SMALL_BUF);

	process_double_slash(urll->full_path, MAX_SMALL_BUF);
	process_double_slash(urll->directory, MAX_SMALL_BUF);

}

/*
 * Copy a link's url data from urls to urld.
 *
 * urld is zeroed first.  The four url strings and the type, flag, count
 * and return code fields are copied; server, modified_date, size and the
 * list pointers are not, so they are zero/NULL in urld afterwards.
 *
 * Returns 0.
 */
int
copy_full_url(struct url_link *urld, struct url_link *urls)
{
  memset(urld, 0, sizeof(url_link));

  /* the memset above guarantees termination of the MAX_SMALL_BUF-1 copies */
  strncpy( urld->url,       urls->url,       MAX_SMALL_BUF-1);
  strncpy( urld->site,      urls->site,      MAX_SMALL_BUF-1);
  strncpy( urld->full_path, urls->full_path, MAX_SMALL_BUF-1);
  strncpy( urld->directory, urls->directory, MAX_SMALL_BUF-1);

  urld->url_type    = urls->url_type;
  urld->dont_follow = urls->dont_follow;
  urld->used        = urls->used;
  urld->count       = urls->count;
  urld->checked     = urls->checked;
  urld->paged       = urls->paged;
  urld->return_code = urls->return_code;

  return 0;
}


/*
 * Resolve ".." components of a path in place.
 *
 * path  NUL terminated path, rewritten in place
 * len   size of the buffer behind path; at most MAX_SMALL_BUF bytes are
 *       looked at
 *
 * Whenever two consecutive dots are seen, the output is wound back to the
 * previous '/' so that the preceding component disappears, e.g.
 * "/a/b/../c.html" becomes "/a/c.html".  The whole buffer is zeroed
 * before the result is copied back.
 * NOTE(review): any two consecutive dots trigger this, also inside a file
 * name such as "a..b".
 *
 * Returns 0.
 */
int
process_up_dir(char *path, int len)
{
  int i, j;
  static char tmpbuf[MAX_SMALL_BUF + 1];   /* + 1: room for the terminator */
  int safe_len;

  if ( path == NULL || len <= 0 ) {
    return 0;
  }

  safe_len = MAX_SMALL_BUF;
  if ( safe_len > len ) {
    safe_len = len;
  }

  /* strip out .. */
  for ( j = 0, i = 0; i < safe_len && path[i] != 0; ++i, ++j ) {
    if ( path[i] == '.' && i + 1 < safe_len && path[i+1] == '.' ) {
      i += 2;
      if ( j > 0 ) {
        j -= 2;
        if ( j < 0 ) {
          j = 0;            /* never index in front of tmpbuf */
        }
        while ( j > 0 && tmpbuf[j] != '/' ) {
          j--;
        }
      }
      if ( i >= safe_len || path[i] == 0 ) {
        /* ".." was the end of the path: stop here instead of reading
         * on behind the terminator */
        break;
      }
    }
    tmpbuf[j] = path[i];
  }

  tmpbuf[j] = 0;
  memset(path, 0, len);
  strncpy( path, tmpbuf, safe_len-1);
  return 0;
}

/*
 * Collapse runs of '/' in a path into a single '/', in place.
 *
 * path  NUL terminated path, rewritten in place
 * len   size of the buffer behind path; at most MAX_SMALL_BUF bytes are
 *       looked at
 *
 * The whole buffer is zeroed before the result is copied back.
 *
 * Returns 0.
 */
int
process_double_slash(char *path, int len)
{
  int i, j;
  int safe_len;
  static char tmpbuf[MAX_SMALL_BUF + 1];   /* + 1: room for the terminator */

  if ( path == NULL || len <= 0 ) {
    return 0;
  }

  safe_len = MAX_SMALL_BUF;
  if ( safe_len > len ) {
    safe_len = len;
  }

  /* strip out double slashes: keep a slash, skip the ones that follow */
  for ( j = 0, i = 0; i < safe_len && path[i] != 0; ++i, ++j ) {
    tmpbuf[j] = path[i];
    while ( path[i] == '/' && i + 1 < safe_len && path[i+1] == '/' ) {
      ++i;
    }
  }
  tmpbuf[j] = 0;

  memset(path, 0, len);
  strncpy( path, tmpbuf, safe_len-1);
  return 0;
}


/*
 * Map a return code to its descriptive text.
 *
 * error_code  an HTTP status or one of the HTTP_* pseudo codes
 *
 * Returns the matching ERR_HTTP_* text.  For a code without an entry the
 * text "unknown return code N" is built in a static buffer, which the
 * next such call overwrites.
 */
char *http_error_string(int error_code)
{
  static char unknown[MAX_SMALL_BUF];

  switch( error_code ) {
    /* 1xx */
    case HTTP_CONTINUE:                      return(ERR_HTTP_CONTINUE);
    case HTTP_SWITCHING_PROTOCOLS:           return(ERR_HTTP_SWITCHING_PROTOCOLS);
    /* 2xx */
    case HTTP_OK:                            return(ERR_HTTP_OK);
    case HTTP_CREATED:                       return(ERR_HTTP_CREATED);
    case HTTP_ACCEPTED:                      return(ERR_HTTP_ACCEPTED);
    case HTTP_NON_AUTHORITATIVE:             return(ERR_HTTP_NON_AUTHORITATIVE);
    case HTTP_NO_CONTENT:                    return(ERR_HTTP_NO_CONTENT);
    case HTTP_RESET_CONTENT:                 return(ERR_HTTP_RESET_CONTENT);
    case HTTP_PARTIAL_CONTENT:               return(ERR_HTTP_PARTIAL_CONTENT);
    /* 3xx */
    case HTTP_MULTIPLE_CHOICES:              return(ERR_HTTP_MULTIPLE_CHOICES);
    case HTTP_MOVED_PERMANENTLY:             return(ERR_HTTP_MOVED_PERMANENTLY);
    case HTTP_MOVED_TEMPORARILY:             return(ERR_HTTP_MOVED_TEMPORARILY);
    case HTTP_SEE_OTHER:                     return(ERR_HTTP_SEE_OTHER);
    case HTTP_NOT_MODIFIED:                  return(ERR_HTTP_NOT_MODIFIED);
    case HTTP_USE_PROXY:                     return(ERR_HTTP_USE_PROXY);
    /* 4xx */
    case HTTP_BAD_REQUEST:                   return(ERR_HTTP_BAD_REQUEST);
    case HTTP_UNAUTHORIZED:                  return(ERR_HTTP_UNAUTHORIZED);
    case HTTP_PAYMENT_REQUIRED:              return(ERR_HTTP_PAYMENT_REQUIRED);
    case HTTP_FORBIDDEN:                     return(ERR_HTTP_FORBIDDEN);
    case HTTP_NOT_FOUND:                     return(ERR_HTTP_NOT_FOUND);
    case HTTP_METHOD_NOT_ALLOWED:            return(ERR_HTTP_METHOD_NOT_ALLOWED);
    case HTTP_NOT_ACCEPTABLE:                return(ERR_HTTP_NOT_ACCEPTABLE);
    case HTTP_PROXY_AUTHENTICATION_REQUIRED: return(ERR_HTTP_PROXY_AUTHENTICATION_REQUIRED);
    case HTTP_REQUEST_TIME_OUT:              return(ERR_HTTP_REQUEST_TIME_OUT);
    case HTTP_CONFLICT:                      return(ERR_HTTP_CONFLICT);
    case HTTP_GONE:                          return(ERR_HTTP_GONE);
    case HTTP_LENGTH_REQUIRED:               return(ERR_HTTP_LENGTH_REQUIRED);
    case HTTP_PRECONDITION_FAILED:           return(ERR_HTTP_PRECONDITION_FAILED);
    case HTTP_REQUEST_ENTITY_TOO_LARGE:      return(ERR_HTTP_REQUEST_ENTITY_TOO_LARGE);
    case HTTP_REQUEST_URI_TOO_LARGE:         return(ERR_HTTP_REQUEST_URI_TOO_LARGE);
    case HTTP_UNSUPPORTED_MEDIA_TYPE:        return(ERR_HTTP_UNSUPPORTED_MEDIA_TYPE);
    /* 5xx */
    case HTTP_INTERNAL_SERVER_ERROR:         return(ERR_HTTP_INTERNAL_SERVER_ERROR);
    case HTTP_NOT_IMPLEMENTED:               return(ERR_HTTP_NOT_IMPLEMENTED);
    case HTTP_BAD_GATEWAY:                   return(ERR_HTTP_BAD_GATEWAY);
    case HTTP_SERVICE_UNAVAILABLE:           return(ERR_HTTP_SERVICE_UNAVAILABLE);
    case HTTP_GATEWAY_TIME_OUT:              return(ERR_HTTP_GATEWAY_TIME_OUT);
    case HTTP_VERSION_NOT_SUPPORTED:         return(ERR_HTTP_VERSION_NOT_SUPPORTED);
    case HTTP_VARIANT_ALSO_VARIES:           return(ERR_HTTP_VARIANT_ALSO_VARIES);
    /* codes produced by this program itself */
    case HTTP_COULD_NOT_CONNECT:             return(ERR_HTTP_COULD_NOT_CONNECT);
    case HTTP_NETWORK_READ_ERROR:            return(ERR_HTTP_NETWORK_READ_ERROR);
    case HTTP_NO_SUCH_EMAIL_ADDR:            return(ERR_HTTP_NO_SUCH_EMAIL_ADDR);
    case HTTP_NOT_CHECKED:                   return(ERR_HTTP_NOT_CHECKED);
    default:
      break;
  }

  snprintf(unknown, sizeof(unknown), "unknown return code %d", error_code);
  return(unknown);
}

/*
 * Extract the status code from the status line of an HTTP response and
 * store it in urll->return_code.
 *
 * html_text  the response; expected to start with "HTTP" followed by the
 *            version, a space and the three digit code
 *
 * When no code can be read the return code becomes 0 and a diagnostic
 * with the beginning of the response is printed on stdout.
 *
 * Returns 0.
 */
int
set_return_code(struct url_link *urll, char *html_text)
{
  char  return_code[4] = "";   /* stays empty when there is no status line */
  char *status = NULL;
  int   i;

  /* the code is the token after the first space of the status line */
  if ( html_text != NULL && strncmp(html_text, "HTTP", 4) == 0 ) {
    status = strchr(html_text, ' ');
  }
  if ( status != NULL ) {
    ++status;
    for ( i = 0; i < 3 && status[i] != 0; ++i ) {
      return_code[i] = status[i];
    }
    return_code[i] = 0;
  }

  urll->return_code = atoi(return_code);
  if ( urll->return_code == 0 ) {
    printf("Bad return code!!!!\n");
    printf("%s%s %d %s\n",
      urll->site, urll->full_path, urll->return_code, return_code);

    /* show at most the first 20 characters, never past the terminator */
    if ( html_text != NULL ) {
      for ( i = 0; i < 20 && html_text[i] != 0; ++i ) {
        printf("%c", html_text[i]);
      }
    }
    printf("\n");
  }
  return 0;
}

/*
 * Request a url with HTTP/1.0 GET and read the response into TmpBuf.
 *
 * urll     link to fetch; site and full_path are used for the request,
 *          size receives the number of bytes read
 * not_all  1: read only the first 400 bytes (enough for the headers);
 *          otherwise read up to MAX_HUGE_BUF bytes
 *
 * Return -1 on error (urll->return_code is set to HTTP_COULD_NOT_CONNECT
 *           or HTTP_NETWORK_READ_ERROR)
 *         1 on success
 * The connection is closed in both cases.
 */
int
http_read_url(struct url_link *urll, int not_all)
{
  int size;
  int len;
  int total_size;
  int request_len;
  int sent;

  if ( connect_to_server(urll->site) < 0 ) {
    urll->return_code = HTTP_COULD_NOT_CONNECT;
    return(-1);
  }

  total_size = ( not_all == 1 ) ? 400 : MAX_HUGE_BUF;

  request_len = snprintf(
    TmpBuf, sizeof( TmpBuf ),
    "GET %s HTTP/1.0\015\012"
    "Host: %s\015\012"
    "Accept: */*\015\012"
    "%s\015\012"
    "%s\015\012\015\012",
    urll->full_path, urll->site, USER_AGENT, REFERER
  );
  if ( request_len < 0 || (size_t)request_len >= sizeof( TmpBuf ) ) {
    /* the request could not be built completely; do not send a torso */
    urll->return_code = HTTP_NETWORK_READ_ERROR;
    close(sock);
    return(-1);
  }

  /* write() may accept only part of the data; send until all is out */
  for ( sent = 0; sent < request_len; sent += size ) {
    size = (int)write(sock, TmpBuf + sent, request_len - sent);
    if ( size <= 0 ) {
      urll->return_code = HTTP_NETWORK_READ_ERROR;
      close(sock);
      return(-1);
    }
  }

  len = 0;
  memset( TmpBuf, 0, MAX_HUGE_BUF);

  /* make sure we check for receiving errors; one byte is always kept
   * free so that the text stays NUL terminated */
  do {
    size = (int)read( sock, &TmpBuf[len], total_size - len - 1);
    if ( size >= 0 ) {
      len += size;
    } else {
      urll->return_code = HTTP_NETWORK_READ_ERROR;
      close(sock);
      return(-1);
    }
  } while (size > 0 && len < total_size);

  urll->size = len;
  close(sock);
  return(1);
}

/*
 * Append a '/' to a url whose last component contains no dot, i.e. that
 * looks like a directory rather than a file.
 *
 * path  NUL terminated url, modified in place
 * len   size of the buffer behind path
 *
 * The last component is the text after the last '/', or the whole string
 * when there is no '/'.  The slash is only appended when the buffer has
 * room for it; a url that fills the buffer is left unchanged.
 *
 * Returns 0.
 */
int
process_dir_slash(char *path, int len)
{
  char   *tail;
  size_t  path_len;

  if ( path == NULL ) {
    return 0;
  }

  tail = strrchr(path, '/');
  if ( tail == NULL ) {
    tail = path;
  }
  if ( strchr(tail, '.') != NULL ) {
    return 0;
  }

  path_len = strlen(path);
  if ( len > 0 && path_len + 1 < (size_t)len ) {
    path[path_len]     = '/';
    path[path_len + 1] = '\0';
  }
  return 0;
}

/*
 * Append child_urll to the child list of parent_urll.
 *
 * Nothing is done when parent_urll is NULL.  The list entry is allocated
 * here; running out of memory is reported through joe_fatal() like the
 * other allocations in this file.
 *
 * Returns 0 on success (or when there is no parent), -1 when no memory
 * could be allocated and joe_fatal() returned.
 */
int
add_child_url(url_link *child_urll, url_link *parent_urll)
{
  url_child *entry;
  url_child *tail;

  if ( parent_urll == NULL ) {
    return 0;
  }

  entry = (url_child *)malloc(sizeof(url_child));
  if ( entry == NULL ) {
    joe_fatal("url_child");
    return -1;
  }
  memset(entry, 0, sizeof(url_child));
  entry->child = (void *)child_urll;

  /* first child, or append behind the last one */
  if ( parent_urll->child == NULL ) {
    parent_urll->child = entry;
  } else {
    for ( tail = parent_urll->child; tail->next != NULL; tail = tail->next )
      ;
    tail->next = entry;
  }
  return 0;
}

/*
 * Format an integer with a comma between each group of three digits,
 * e.g. 1234567 becomes "1,234,567" and -1234 becomes "-1,234".
 *
 * Returns a pointer to a static buffer that the next call overwrites.
 */
char *comma_string_int(int in_int)
{
  static char out[100];
  char digits[32];
  unsigned int magnitude;
  int ndigits;
  int i;
  int o = 0;

  /* work on the magnitude so that the sign never takes part in the
   * grouping; the unsigned negation is also safe for INT_MIN */
  magnitude = ( in_int < 0 ) ? 0u - (unsigned int)in_int : (unsigned int)in_int;

  ndigits = snprintf(digits, sizeof(digits), "%u", magnitude);
  if ( ndigits < 0 || ndigits >= (int)sizeof(digits) ) {
    out[0] = '\0';
    return(out);
  }

  if ( in_int < 0 ) {
    out[o++] = '-';
  }
  for ( i = 0; i < ndigits; ++i ) {
    /* a comma goes in front of every digit that starts a full group of
     * three, except in front of the very first digit */
    if ( i > 0 && (ndigits - i) % 3 == 0 ) {
      out[o++] = ',';
    }
    out[o++] = digits[i];
  }
  out[o] = '\0';

  return(out);
}

/*
 * Format a number of seconds as "H:MM:SS", "M:MM"-style "MM:SS" or plain
 * seconds, depending on its size:
 *   under a minute   "%2d"            e.g. " 5"
 *   under an hour    "%2d:%02d"       e.g. " 1:00" for 60
 *   otherwise        "%2d:%02d:%02d"  e.g. " 1:01:01" for 3661
 * A negative value is printed as is with "%2d".
 *
 * Returns a pointer to a static buffer that the next call overwrites.
 */
char *time_string_int(int in_int)
{
  static char tmpbuf[100];
  int sec;
  int min;
  int hour;

  if ( in_int < 0 ) {
    sprintf(tmpbuf, "%2d", in_int);
    return(tmpbuf);
  }

  /* plain division: seconds and minutes always end up in 0..59 */
  hour = in_int / 3600;
  min  = (in_int % 3600) / 60;
  sec  = in_int % 60;

  if ( hour > 0 ) {
    sprintf(tmpbuf,"%2d:%02d:%02d", hour, min, sec);
  } else if ( min > 0 ) {
    sprintf(tmpbuf,"%2d:%02d", min, sec);
  } else {
    sprintf(tmpbuf,"%2d", sec);
  }
  return(tmpbuf);
}

clean_up_url(urll)
 struct url_link *urll;
{
	/* gets set the first time thru */
	if ( TheSubDir[0] == 0 ) {
		strcpy( TheSubDir, urll->full_path);
		return;
	}
	if ( urll->url_type != LOCAL_HTML ) {
		return;
	}

	if ( strstart( urll->full_path, TheSubDir ) != urll->full_path ) {
		if ( strcmp( urll->site, TheSite ) == 0 ) {
			urll->url_type = LOCAL_SUBDIR;
		} else {
			urll->url_type = REMOTE_URL;
		}
	}
}

check_mailto(urll)
 struct url_link *urll;
{
 int i,j,k;
static int responselen;
static unsigned char *responseend;
static unsigned char *responsepos;
static char name[MAXDNAME];
int numanswers;


	if ( verbose == 1 ) {
		printf("mailto check %s\n", urll->full_path);
	}

	/* check for a good email address type */
	for(i=0;urll->full_path[i]!= 0 && urll->full_path[i] != '@'; ++i);
	if ( urll->full_path[i] != '@' || i == j) {
		urll->return_code = HTTP_NO_SUCH_EMAIL_ADDR;
		return;
	}
	++i;
	for(j=0;urll->full_path[i]!= 0 && urll->full_path[i] != '@'; ++i,++j){
		CBuf[j] = urll->full_path[i];
	}
	CBuf[j] = 0;

	/* query the net for a mail record for this site */
	responselen = res_query( CBuf, C_IN, T_MX, response.buf, sizeof(response));
	if (responselen<=0 || errno==ECONNREFUSED || h_errno==TRY_AGAIN) {
		urll->return_code = HTTP_NO_SUCH_EMAIL_ADDR;
		return;
	}

	/* if there are at least one mail hosts, success */
	if ( ntohs(response.hdr.ancount) <= 0 ) {
		urll->return_code = HTTP_NO_SUCH_EMAIL_ADDR;
		return;
	}

	urll->return_code = HTTP_OK;

}

init_all()
{
  Virgin   = 1;
  port = HTTP_PORT;
  verbose  = 0;
  CheckRemote = 1;
  CGIRun   = 0;
  MaxUrls  = 0;
  filebool = 0;
  newbool  = 0;
  memset(filename, 0, MAX_SMALL_BUF);
  memset(TheSiteAddr, 0, MAX_SMALL_BUF);
  memset(TheSite, 0, MAX_SMALL_BUF);
  memset(TheSubDir, 0, MAX_SMALL_BUF);

  res_init();
  catch_signal();

}

/*
 * SIGINT handler installed by catch_signal(): save the urls collected so
 * far, report completion on descriptor 1 and exit with EXIT_SUCCESS.
 */
void interrupt_check()
{
  static const char finished[] = "done.\n";

  write_to_file();
  write( 1, finished, sizeof(finished) - 1 );
  exit( EXIT_SUCCESS );
}

/*
 * Install interrupt_check() as the handler for SIGINT, with sigaction()
 * when HASSIGACTION is defined and with signal() otherwise.
 *
 * Returns 0 on success, -1 when the handler could not be installed.
 */
int
catch_signal(void)
{
#ifdef HASSIGACTION
  struct sigaction sa;

  memset(&sa, 0, sizeof(sa));
  /* the address of the handler, not the result of calling it */
  sa.sa_handler = interrupt_check;
  sa.sa_flags = 0;
  sigemptyset(&sa.sa_mask);
  if ( sigaction(SIGINT, &sa, (struct sigaction *) 0) < 0 ) {
    return -1;
  }
#else
  if ( signal(SIGINT, interrupt_check) == SIG_ERR ) {
    return -1;
  }
#endif
  return 0;
}

set_server( struct url_link *urll)
{
  char *tmpstr;
  int i;

  tmpstr = strstr( TmpBuf, "Server: " );
  if ( tmpstr != NULL ) {
    tmpstr += 8; 
    for(i=0;*tmpstr!=(char)NULL && *tmpstr!='\n' && *tmpstr!='\r'; ++i,++tmpstr){
      urll->server[i] = *tmpstr;
    }
    urll->server[i] = 0;
  }
}

set_page_date(urll)
 struct url_link *urll;
{
 char *tmpstr;
 int i;

	tmpstr = strstr( TmpBuf, "Last-modified: " );
	if ( tmpstr != NULL ) {
		tmpstr += 15; 
		for(i=0;*tmpstr!=(char)NULL && *tmpstr!='\n' && *tmpstr!='\r'; ++i,++tmpstr){
			urll->modified_date[i] = *tmpstr;
		}
		urll->modified_date[i] = 0;
	}
}

/*
 * Test whether sstr begins with tstr.
 *
 * Returns sstr when it starts with tstr (an empty tstr always matches),
 * NULL when it does not or when either argument is NULL.
 */
char *strstart(char *sstr, char *tstr)
{
  if ( sstr == NULL || tstr == NULL ) {
    return(NULL);
  }
  /* comparing the first strlen(tstr) characters is the prefix test; the
   * comparison stops by itself at the end of a shorter sstr */
  if ( strncmp(sstr, tstr, strlen(tstr)) != 0 ) {
    return(NULL);
  }
  return(sstr);
}

set_starting( char *start_site ) 
{
 int i,j,k;
 url_link *urll;

	memset(TmpBuf, 0, MAX_HUGE_BUF);
	strncpy( TmpBuf, start_site, MAX_HUGE_BUF-1); 

	memset( CBuf, 0, MAX_MEDIUM_BUF);
	strncpy( CBuf, start_site, MAX_MEDIUM_BUF-1);

	i = 0;
	if ( strstr( TmpBuf, "http://" ) != NULL ) {
		for(;TmpBuf[i]!='/';++i);
		i+=2;
	} else {

		memset(CBuf, 0, MAX_MEDIUM_BUF);
		strncpy(CBuf, "http://", MAX_MEDIUM_BUF-1);

		strncat(CBuf, start_site, MAX_MEDIUM_BUF-1);

	}
	k = 0;
	for(j=0;TmpBuf[i]!=0&&TmpBuf[i]!='/';++i,++j){
		TheSite[j] = TmpBuf[i];
	}
	TheSite[j] = 0; 

	urll = add_url_link(CBuf,NULL);
	clean_up_url(urll);
}

/*
 * getenv() that never returns NULL.
 *
 * Returns the value of the environment variable var, or an empty string
 * when the variable is not set.
 */
char *safe_getenv(char *var)
{
  char *value = getenv(var);

  return( ( value != NULL ) ? value : "" );
}

/*
 * Remove the in-page anchor from a url: the string is cut, in place, at
 * its first '#', the same position add_url_link() cuts at.
 *
 * path  NUL terminated url
 * len   size of the buffer behind path (not needed for cutting)
 *
 * Returns 0.
 */
int
process_index_tag(char *path, int len)
{
  char *hash;

  (void)len;

  if ( path == NULL ) {
    return(0);
  }
  hash = strchr(path, '#');
  if ( hash != NULL ) {
    *hash = 0;
  }
  return(0);
}

/*
 * Check an ftp link.  The server is not contacted: the link is always
 * marked HTTP_OK.
 *
 * Returns 0.
 */
int
check_ftp(url_link *urll)
{
  if ( verbose == 1 ) {
    printf("getting ftp %-30s %s\n", urll->site, urll->full_path);
  }
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Placeholder for fetching a url from an ftp server.
 *
 * At present only the connection is opened and closed again.  The ftp
 * dialogue itself (read the greeting, log in, set binary mode, retrieve
 * the file) is not implemented, so nothing is read and urll->size is not
 * set.  No function in this file calls it.
 *
 * Return -1 on error (urll->return_code is set to HTTP_COULD_NOT_CONNECT)
 *         1 on success
 */
int
ftp_read_url(struct url_link *urll)
{
  if ( connect_to_ftp_server(urll->site) < 0 ) {
    urll->return_code = HTTP_COULD_NOT_CONNECT;
    return(-1);
  }

  close(sock);
  return(1);
}

/*
 * Connect to an ftp server on FTP_PORT.
 *
 * site  host name or dotted-quad address
 *
 * Returns 0 on successful connection, -1 on failure; the socket is closed
 * again in that case.
 * Globals Set: sock is set to the socket descriptor;
 *              TheSiteAddr receives the dotted address the first time
 *              the starting site itself is resolved
 */
int
connect_to_ftp_server(char *site)
{
  struct hostent *hostEntry;
  struct sockaddr_in internetAddr;

  if ( site == NULL ) {
    return(-1);
  }
  if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
    return(-1);
  }

  memset(&internetAddr, 0, sizeof(internetAddr));
  internetAddr.sin_port   = htons(FTP_PORT);
  internetAddr.sin_family = AF_INET;

  hostEntry = gethostbyname(site);
  if ( hostEntry != NULL && hostEntry->h_addrtype == AF_INET &&
       hostEntry->h_addr_list[0] != NULL &&
       (size_t)hostEntry->h_length <= sizeof(internetAddr.sin_addr) ) {
    memcpy(&internetAddr.sin_addr, hostEntry->h_addr_list[0], hostEntry->h_length);
  } else {
    /* inet_addr() returns an unsigned value: failure is INADDR_NONE,
     * a "< 0" test can never be true */
    internetAddr.sin_addr.s_addr = inet_addr(site);
    if ( internetAddr.sin_addr.s_addr == INADDR_NONE ) {
      close(sock);
      return(-1);
    }
  }
  if ( TheSiteAddr[0] == 0 && strcmp(site, TheSite) == 0 ) {
    snprintf( TheSiteAddr, sizeof(TheSiteAddr), "%s", inet_ntoa(internetAddr.sin_addr) );
  }

  /*
   * connect to remote host
   */
  if ( connect(sock, (struct sockaddr *)&internetAddr, sizeof(internetAddr)) < 0 ) {
    close(sock);
    return(-1);
  }
  return(0);
}

/*
 * Check a file:// link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_file(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Check a news: link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_news(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Check a gopher:// link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_gopher(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Check a telnet:// link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_telnet(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Check a wais:// link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_wais(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Check a powwow: link.  Nothing is verified: the link is always marked
 * HTTP_OK.  Returns 0.
 */
int
check_powwow(url_link *urll)
{
  urll->return_code = HTTP_OK;
  return 0;
}

/*
 * Fill in a link whose url is used as it stands: full_path becomes a copy
 * of the url and site becomes the starting site.  Used for urls beginning
 * with '/' and for mailto:, news: and powwow: urls.
 *
 * Returns 0.
 */
int
setup_standard(url_link *urll)
{
  /* zero first so that the MAX_SMALL_BUF-1 copies are always terminated */
  memset(urll->full_path, 0, MAX_SMALL_BUF);
  strncpy( urll->full_path, urll->url, MAX_SMALL_BUF-1);

  memset( urll->site, 0, MAX_SMALL_BUF);
  strncpy( urll->site, TheSite, MAX_SMALL_BUF-1);

  return 0;
}

/*
 * Fill in a link given as "scheme:path" without a host, e.g.
 * "http:page.html": full_path becomes the text behind the first ':' and
 * site becomes the starting site.  A url without any ':' yields an empty
 * full_path.
 *
 * Returns 0.
 */
int
setup_single_colon(url_link *urll)
{
  char *rest;

  /* walk past the colon, but never past the end of the string */
  rest = strchr(urll->url, ':');
  if ( rest != NULL ) {
    ++rest;
  } else {
    rest = urll->url + strlen(urll->url);
  }

  memset(urll->full_path, 0, MAX_SMALL_BUF);
  strncpy( urll->full_path, rest, MAX_SMALL_BUF-1);

  memset( urll->site, 0, MAX_SMALL_BUF);
  strncpy( urll->site, TheSite, MAX_SMALL_BUF-1);

  return 0;
}

/*
 * Split a "scheme://host/path" url into site and full_path.
 *
 * The two characters starting at the first '/' of the url are skipped.
 * site is the text from there up to the next '/', full_path is the rest
 * including that '/'.  A url without a path gets the full_path "/".
 * Both parts are cut to the size of their fields.
 *
 * Returns 0.
 */
int
setup_double_slash(url_link *urll)
{
  char *cursor;
  int i;

  cursor = urll->url;

  /* walk past the slashes, but never past the end of the string */
  while ( *cursor != '/' && *cursor != '\0' ) ++cursor;
  if ( *cursor != '\0' ) ++cursor;
  if ( *cursor != '\0' ) ++cursor;

  /* host name */
  for ( i = 0;
        *cursor != '/' && *cursor != '\0' && i < (MAX_SMALL_BUF-1);
        ++i, ++cursor ) {
    urll->site[i] = *cursor;
  }
  urll->site[i] = 0;

  /* everything behind the host name is the path */
  for ( i = 0; *cursor != '\0' && i < (MAX_SMALL_BUF-1); ++i, ++cursor ) {
    urll->full_path[i] = *cursor;
  }
  if ( i == 0 ) {
    memset(urll->full_path, 0, MAX_SMALL_BUF);
    strncpy( urll->full_path, "/", MAX_SMALL_BUF-1);
  } else {
    urll->full_path[i] = 0;
  }
  return 0;
}
