///////
   //    HtmlParser.h
   //    HtmlParser Class declaration
   //
   //    Class for parsing an HTML document and storing
   //    the extracted info into the DB.
   //
   //    Copyright (c) 1999-2004 Comune di Prato - Prato - Italy
   //    Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
   //
   //    For copyright details, see the file COPYING in your distribution
   //    or the GNU General Public License version 2 or later 
   //    <http://www.gnu.org/copyleft/gpl.html>
   //
   //    $Id: HtmlParser.h,v 1.24 2004/04/26 14:12:23 angusgb Exp $
   //
   //    G.Bartolini
   //    started: 30.01.2000
///////

#ifndef _HTMLPARSER_H
#define _HTMLPARSER_H

#include "htString.h"
#include "Scheduler.h"
#include "HtmlStatement.h"
#include "HtmlAttribute.h"
#include "Link.h"
#include "_Url.h"
#include "AccessibilityCheck.h"


class HtmlParser
{
   public:

      HtmlParser();
      ~HtmlParser();

      // Enumeration of the parser codes returned by functions
      enum HtmlParser_Codes
      {
         HtmlParser_NullTag,
         HtmlParser_TagNotStored,
         HtmlParser_MalformedTag,
         HtmlParser_StatementFailed,
         HtmlParser_AttributeFailed,
         HtmlParser_AccessibilityCheckFailed,
         HtmlParser_NoLink,
         HtmlParser_NormalLink,
         HtmlParser_DirectLink,
         HtmlParser_Anchor,
         HtmlParser_LinkFailed,
         HtmlParser_OK,
      };


      HtmlParser_Codes operator() ( Scheduler &scheduler );
      
      // Static methods for managing debug level
      static void SetDebugLevel (int d) { debug=d;}

   protected:

   ///////
      // Protected Functions
   ///////

      HtmlParser_Codes ParseTag();           // Parse a HTML statement
      int CheckTag(char *tag);   // Check if a tag has to be stored
      HtmlParser_Codes FindLink();           // Find a link
      const String encodeSGML(const String &str);
      const String decodeSGML(const String &str);
      
   ///////
      // Protected Attributes
   ///////

      // Scheduler Object for getting/putting info from/into
      // memory and DB
      Scheduler *CurrentScheduler;

      // Base Url used for resolving relative paths
      _Url *BaseUrl;

      // Temporary buffer for tags storage
      unsigned char text[8192];
      
      // position is set to the beginning of the retrieved document contents
      unsigned char *position;

      // Temporary cursor for source string (contents)
      unsigned char *ppos;

      // Temporary cursor for destination string (text -> tags)
      unsigned char *ptext;
      
      // Counter of document tags
      unsigned int TagPosition;

      // Row number
      unsigned int row;

      // Last tag with a link
      unsigned int LastLinkTagPosition;

      // Temporary Object for HtmlStatement storing
      HtmlStatement htmlstatement;

      // Temporary Object for HtmlAttribute storing
      HtmlAttribute htmlattribute;

      // Temporary Object for Link storing
      Link link;

      // Temporary String for Charset specification
      String Charset;

      // Temporary String for DocType specification
      String DocType;

      // HTML Description of a link (<A href="uri">description</a>)
      String LinkDescription;

      // Temporary String for Description
      String Description;

      // Temporary String for Keywords
      String Keywords;

      // HTML document language (HTML lang="xx(x)" according to ISO 639)
      String DocLanguage;

      // Current header level
      unsigned int CurrentHx;

      // Previous header level
      unsigned int PreviousHx;

	  // Current alternative text
	  String CurrentAltText;

	  // Current resource reference
	  String CurrentResourceRef;

      // Previous ALT attribute position
      unsigned int AltAttrPosition;

///////
   //    Internal flags
///////

   bool ignore;      // if true we ignore the tags
   bool memo;        // Has the tag to be stored? true=yes
   int tag_type;    // type of the tag (start, end, empty)
   int location;    // location in the document (script, title, link, etc.)
   int doc_acheck; // accessibility check info (document level)
   bool store_statement; // should we store the statement?

///////
   //    Static attributes
///////

      static int debug;    // Run-time debugging level

      // Encode an URL
      static void encodeURL(String &str, char *valid = ";/?:@&=+$,");

	  // Insert an accessibility check record into the database
	  bool InsertAccessibilityCheck(unsigned int idurl, unsigned int tagposition,
         unsigned int attrposition, unsigned int code);

	  // Returns the length of a string (skipping consecutive spaces)
      unsigned CountSGMLStringLength(const char* str);

	  // Returns an integer with results of a check regarding an ALT text
      unsigned CheckAlt();
};

#endif
