///////
   //
   //    HtDefaults.cc
   //
   //    default values for ht://Check
   //
   //    Copyright (c) 1999-2002 Comune di Prato - Prato - Italy
   //    Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
   //
   //    For copyright details, see the file COPYING in your distribution
   //    or the GNU General Public License version 2 or later 
   //    <http://www.gnu.org/copyleft/gpl.html>
   //
   //    $Id: HtDefaults.cc,v 1.17 2002/02/06 11:04:34 angusgb Exp $
   //
///////
   
// Revision identifier string (RCS/CVS "$Id$" keyword). It is compiled in
// only when RELEASE is defined to a non-zero value.
#if RELEASE
static char RCSid[] = "$Id: HtDefaults.cc,v 1.17 2002/02/06 11:04:34 angusgb Exp $";
#endif

#include "Configuration.h"

ConfigDefaults	defaults[] =
{

{ "accept_language", "", 
	"string", "htcheck", "accept_language: en-us en it", "
   This attribute allows to restrict the set of natural languages that are
   preferred as a response to an HTTP request performed by the digger. This can be
   done by putting one or more language tags (as defined by RFC 1766) in the
   preferred order, separated by spaces. By doing this, when the server performs a
   content negotiation based on the 'accept-language' given by the HTTP user agent,
   a different content can be shown depending on the value of this attribute. If
   set empty, no language will be sent and the server default will be returned.
" },

{ "bad_extensions", "", 
	"string", "htcheck", "bad_extensions: .foo .bar .bad", "
	This is a list of extensions on URLs which are
	considered non-parsable. This list is used mainly to
	supplement the MIME-types that the HTTP server provides
	with documents. Some HTTP servers do not have a correct
	list of MIME-types and so can advertise certain
	documents as text while they are some binary format.
" },
{ "bad_querystr", "", 
	"string", "htcheck", "bad_querystr: forum=private section=topsecret&amp;passwd=required", "
	This is a list of CGI query strings to be excluded from
	indexing. This can be used in conjunction with CGI-generated
	portions of a website to control which pages are
	indexed.
" },
{"db_name", DB_NAME,
	"string", "htcheck", "db_name: test", "
   Name of the MySQL database to be created or read.
" },

{ "check_external", "true", 
	"boolean", "htcheck", "check_external: false", "
   If set to 'true', htcheck check if external Urls exist or not.
   An external Url is an Url which doesn't match limit configuration
   attributes. External URLs aren't parsed.
" },

{ "disable_cookies", "false", 
	"boolean", "htcheck", "disable_cookies: true", "
   If set to 'true', htcheck will disable the HTTP cookies management.
" },

{ "store_only_links", "true", 
	"boolean", "htcheck", "store_only_links: false", "
      If set to false, htcheck will store in the DB <every> tag he finds
      in every document he crawls.
      If set to true, htcheck stores only those Html attributes and statements
      that produce a link or set an anchor
      (identified by the pair tag: A, attribute: name).
" },
{ "optimize_db", "false", 
	"boolean", "htcheck", "optimize_db: true", "
        Optimize the database tables at the end of the crawl. Disable it if
        the database server doesn't support it.
" },
{ "persistent_connections", "true", 
	"boolean", "htcheck", "persistent_connections: false", "
	If set to true, when servers make it possible, htdig can take advantage
        of persistent connections, as defined by HTTP/1.1 (<I>RFC2616</I>). This permits
        to reduce the number of open/close operations of connections, when retrieving
        a document with HTTP.
" },

{ "head_before_get", "false", 
	"boolean", "htcheck", "head_before_get: true", "
        This option works only if we take advantage of persistent connections (see
        persistent_connections attribute). If set to true an HTTP/1.1 <I>HEAD</I>
        call is made in order to retrieve header information about a document.
        If the status code and the content-type returned let the document be parsable,
        then a following 'GET' call is made.
" },
{ "http_proxy", "", 
	"string", "htcheck", "http_proxy: http://proxy.bigbucks.com:3128", "
	When this attribute is set, all HTTP document
	retrievals will be done using the HTTP-PROXY protocol.
	The URL specified in this attribute points to the host
	and port where the proxy server resides.<br>
	The use of a proxy server greatly improves performance
	of the indexing process.
" },
{ "http_proxy_exclude", "", 
	"string", "htcheck", "http_proxy_exclude: http://intranet.foo.com/", "
	When this is set, URLs matching this will not use the
	proxy. This is useful when you have a mixture of sites
	near to the digging server and far away.
" },
{ "max_doc_size", "100000", 
	"number", "htcheck", "max_doc_size: 5000000", "
	This is the upper limit to the amount of data retrieved
	for documents. This is mainly used to prevent
	unreasonable memory consumption since each document
	will be read into memory by htcheck.
" },
{ "max_retries", "3", 
	"number", "htcheck", "max_retries: 6", "
         This option set the maximum number of retries when retrieving a document
         fails (mainly for reasons of connection).
" },


{ "sql_big_table_option", "true", 
	"boolean", "htcheck", "sql_big_table_option: false", "
        Enable or disable this option that is useful when performing huge queries.
        Otherwise, sometimes when it's not set, the MySQL db server may return
        a 'table is full' error.
" },

{ "start_url", "http://htcheck.sourceforge.net/", 
	"string", "htcheck", "start_url: http://www.somewhere.org/alldata/index.html", "
	This is the list of URLs that will be used to start a
	dig when there was no existing database. Note that
	multiple URLs can be given here.
" },

{ "summary_anchor_not_found", "true", 
	"boolean", "htcheck", "summary_anchor_not_found: false", "
        Enable or disable the show of the summary of the HTML anchors that
        have not been found.
" },

{ "tcp_max_retries", "1", 
	"number", "htcheck", "tcp_max_retries: 6", "
         This option set the maximum number of attempts when a connection
         <A href=\"#timeout\">timeout</A>s.
         After all these retries, the connection attempt results <timed out>.
" },
{ "tcp_wait_time", "5", 
	"number", "htcheck", "tcp_wait_time: 10", "
         This attribute sets the wait time after a connection fails and the
         <A href=\"#timeout\">timeout</A> is raised.
" },

{ "timeout", "30", 
	"number", "htcheck", "timeout: 42", "
	Specifies the time the digger will wait to complete a
	network read. This is just a safeguard against
	unforeseen things like the all too common
	transformation from a network to a notwork.<br>
	The timeout is specified in seconds.
" },
{ "max_hop_count", "999999", 
	"number", "htcheck", "max_hop_count: 4", "
	Instead of limiting the indexing process by URL
	pattern, it can also be limited by the number of hops
	or clicks a document is removed from the starting URL.
	The starting page will have hop count 0.
" },
{"mysql_conf_file_prefix", "my",
	"string", "htcheck", "mysql_conf_file_prefix: htcheck", "
      Prefix for the MySQL configuration file to be searched. Default is 'my' and
      The file searched is usually '~/.my.cnf' (suggested). If it is not found
      the /etc/.my.cnf file is searched. For its syntax, look at 'Option File'
      contents inside the MySQL documentation.
" },
{"mysql_conf_group", "client",
	"string", "htcheck", "mysql_conf_group: htcheck", "
      Group to be searched inside the .my.cnf file of MySQL for getting the
      settings for the connection to the server. In other words, it's the
      section marked with [<group>] inside the MySQL option file (default
      is [client]).      
" },
{ "url_index_length", "64", 
	"number", "htcheck", "url_index_length: -1", "
      This number specifies the length of the index of the
      Url field in the Schedule and Url tables of the database.
      You can set different values depending on the average
      length of the URLs that htcheck can find in your
      sites. If you don't want to set any limitation, just
      put a '-1' value.
      This now allows the user to control the length of the index
      for the Url field in the Schedule and Url tables. This attribute
      may affect the performance of the crawls, as long as the length
      of an index can either slow down or speed up the spidering process.
        
" },
{ "user_agent", "ht://check", 
	"string", "htcheck", "user_agent: htcheck-crawler", "
	This allows customization of the user_agent: field sent when
	the digger requests a file from a server.
" },
{ "exclude_urls", "", 
	"string", "htcheck", "exclude_urls: students.html cgi-bin", "
	If a URL contains any of the space separated patterns,
	it will be rejected. This is used to exclude such
	common things such as an infinite virtual web-tree
	which start with cgi-bin.
" },
{ "limit_normalized", "", 
	"string", "htcheck", "limit_normalized: http://www.mydomain.com", "
	This specifies a set of patterns that all URLs have to
	match against in order for them to be included in the
	search. Unlike the limit_urls_to directive, this is done
	<b>after</b> the URL is normalized and the server_aliases
	directive is applied. This allows filtering after any
	hostnames and DNS aliases are resolved. Otherwise, this
	directive is the same as the <a
	href=\"#limit_urls_to\">limit_urls_to</a> directive.
" },
{ "limit_urls_to", "${start_url}", 
	"string", "htcheck", "limit_urls_to: .sdsu.edu kpbs", "
	This specifies a set of patterns that all URLs have to
	match against in order for them to be included in the
	search. Any number of strings can be specified,
	separated by spaces. If multiple patterns are given, at
	least one of the patterns has to match the URL.<br>
	Matching is a case-insensitive string match on the URL
	to be used. The match will be performed <em>after</em>
	the relative references have been converted to a valid
	URL. This means that the URL will <em>always</em> start
	with <tt>http://</tt>.<br>
	Granted, this is not the perfect way of doing this,
	but it is simple enough and it covers most cases.
" },
{ "authorization", "",
        "string", "htcheck", "authorization: myusername:mypassword", "
	This tells htcheck to send the supplied
        <em>username</em><strong>:</strong><em>password</em> with each HTTP request.
        The credentials will be encoded using the \"Basic\" authentication
        scheme. There <em>must</em> be a colon (:) between the username and
        password.<br>
" },

   {0, 0, 0, 0, 0, 0},

};

// File-scope Configuration object defined alongside the defaults table.
// NOTE(review): presumably the program-wide configuration that gets seeded
// from `defaults` at startup -- confirm against the code that uses it.
Configuration	config;
