// QWeb - An SGML Web Browser
// Copyright (C) 1997  Sean Vyain
// svyain@mail.tds.net
// smvyain@softart.com
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
extern "C"
{
#include <stdio.h>
#include <values.h>
#include <ctype.h>
#include <strings.h>
}
#include "SgmlLexer.h"

// Reserved names recognized while lexing in markup mode.
//
// Each row is { length of the name, the name itself, token to emit }.  The
// name is compared case-insensitively against a candidate of exactly the same
// length.  The final row, whose name pointer is null, terminates the table.
static SgmlLexer::Table lextable[] = {
    // Eight-character names.
    { 8, "ENTITIES", SgmlLexer::Entities },
    { 8, "NMTOKENS", SgmlLexer::Nmtokens },
    { 8, "NOTATION", SgmlLexer::Notation },
    { 8, "NUTOKENS", SgmlLexer::Nutokens },
    { 8, "REQUIRED", SgmlLexer::Required },
    { 8, "SHORTREF", SgmlLexer::Shortref },
    { 8, "STARTTAG", SgmlLexer::Starttag },

    // Seven-character names.
    { 7, "ATTLIST", SgmlLexer::Attlist },
    { 7, "CURRENT", SgmlLexer::Current },
    { 7, "DOCTYPE", SgmlLexer::Doctype },
    { 7, "ELEMENT", SgmlLexer::Element },
    { 7, "IMPLIED", SgmlLexer::Implied },
    { 7, "INCLUDE", SgmlLexer::Include },
    { 7, "NMTOKEN", SgmlLexer::Nmtoken },
    { 7, "NUMBERS", SgmlLexer::Numbers },
    { 7, "NUTOKEN", SgmlLexer::Nutoken },
    { 7, "USELINK", SgmlLexer::Uselink },

    // Six-character names.
    { 6, "CONREF", SgmlLexer::Conref },
    { 6, "ENDTAG", SgmlLexer::Endtag },
    { 6, "ENTITY", SgmlLexer::Entity },
    { 6, "IDREFS", SgmlLexer::Idrefs },
    { 6, "IGNORE", SgmlLexer::Ignore },
    { 6, "NUMBER", SgmlLexer::Number },
    { 6, "PCDATA", SgmlLexer::Pcdata },
    { 6, "PUBLIC", SgmlLexer::Public },
    { 6, "RCDATA", SgmlLexer::Rcdata },
    { 6, "SIMPLE", SgmlLexer::Simple },
    { 6, "SUBDOC", SgmlLexer::Subdoc },
    { 6, "SYSTEM", SgmlLexer::System },
    { 6, "USEMAP", SgmlLexer::Usemap },

    // Five-character names.
    { 5, "CDATA", SgmlLexer::Cdata },
    { 5, "EMPTY", SgmlLexer::Empty },
    { 5, "FIXED", SgmlLexer::Fixed },
    { 5, "IDREF", SgmlLexer::Idref },
    { 5, "NAMES", SgmlLexer::Names },
    { 5, "NDATA", SgmlLexer::Ndata },
    { 5, "SDATA", SgmlLexer::Sdata },

    // Four characters and shorter.
    { 4, "NAME", SgmlLexer::Name },
    { 4, "TEMP", SgmlLexer::Temp },
    { 3, "ANY", SgmlLexer::Any },
    { 2, "ID", SgmlLexer::Id },
    { 2, "MD", SgmlLexer::Md },
    { 2, "MS", SgmlLexer::Ms },
    { 2, "PI", SgmlLexer::Pi },
    { 1, "O", SgmlLexer::OptionalTag },

    // End-of-table sentinel.
    { 0, 0, SgmlLexer::NullToken }
};

//=============================================================================
// Public methods.
//-----------------------------------------------------------------------------
// Creates a lexer with no DTD attached, an empty input buffer, an empty token
// buffer, PCDATA recognition mode, and the end-of-data flag cleared.
//
// The input buffer starts with room for 1024 bytes plus a terminating NUL and
// the token buffer with room for 1024 bytes; both are grown on demand.
SgmlLexer::SgmlLexer()
        : _dtd( 0 ),
          _buf( new char[1025] ),
          _bufSize( 1025 ),
          _bufStart( 0 ),
          _bufEnd( 0 ),
          _token( new char[1024] ),
          _tokenSize( 1024 ),
          _mode( PcdataMode ),
          _done( FALSE )
{
    // data() copies _buf[0.._bufEnd] inclusive when it has to grow the buffer,
    // so the terminator at _buf[_bufEnd] must already exist for an empty
    // buffer; otherwise the first large chunk reads an uninitialized byte.
    _buf[0] = '\0';

    // Start with an empty, terminated token as well.
    _token[0] = '\0';
}

// Frees the two heap buffers owned by the lexer.
SgmlLexer::~SgmlLexer()
{
    // Scratch buffer holding the text of the most recently emitted token.
    delete [] _token;

    // Buffer holding input that has not been tokenized yet.
    delete [] _buf;
}

//=============================================================================
// Public slots.
//-----------------------------------------------------------------------------
void SgmlLexer::data( const char* bytes, int length )
{
    bool    isKeyword;
    char    ch1;
    char    ch2;
    int     idx1;
    int     idx2;

    // Move leftover data to beginning of buffer.
    if ( _bufStart > 0 ) {
        for ( int i = 0; i <= ( _bufEnd - _bufStart ); i++ ) {
            _buf[i] = _buf[i + _bufStart];
        }
        _bufEnd -= _bufStart;
        _bufStart = 0;
    }
//    printf( "SgmlLexer::data() -- _bufSize = %d, _bufStart = %d, _bufEnd = %d, _buf[%d] = %d\n", _bufSize, _bufStart, _bufEnd, _bufEnd, _buf[_bufEnd] );
    
    // Append incoming data to the buffer.
    if ( _bufEnd + length + 1 > _bufSize ) {
        // Resize the buffer.
        _bufSize = _bufEnd + length + 1;
//        printf( "SgmlLexer::data() -- _bufSize = %d\n", _bufSize );
        char* newBuf = new char[ _bufSize ];
        for ( int i = 0; i <= _bufEnd; i++ ) {
            newBuf[i] = _buf[i];
        }
//        printf( "SgmlLexer::data() -- deleting _buf\n" );
        delete [] _buf;
        _buf = newBuf;
    }
//    printf( "SgmlLexer::data() -- _bufSize = %d, _bufStart = %d, _bufEnd = %d\n", _bufSize, _bufStart, _bufEnd );

    for ( int i = 0; i < length; i++ ) {
        _buf[i + _bufEnd] = bytes[i];
    }
    _bufEnd += length;
    _buf[_bufEnd] = '\0';
//    printf( "SgmlLexer::data() -- _bufSize = %d, _bufStart = %d, _bufEnd = %d\n", _bufSize, _bufStart, _bufEnd );

//    for ( int i = 0; i <= _bufEnd; i++ ) {
//        printf( "SgmlLexer::data() -- _buf[%d] = %d\n", i, _buf[i] );
//    }

    // Look for tokens.
    while ( _bufStart < _bufEnd ) {
        int length = _bufEnd - _bufStart;
//        printf( "SgmlLexer::data() -- _bufSize = %d, _bufStart = %d, _bufEnd = %d, _buf = '%s'\n", _bufSize, _bufStart, _bufEnd, _buf + _bufStart );
        
        switch ( _mode ) {
            case CdataMode:
                // Only the end tag open delimeter is recognized.
                for ( idx1 = _bufStart; idx1 < ( _bufEnd - 1 ); idx1++ ) {
                    if ( ( _buf[idx1] == '<' ) && ( _buf[idx1+1] == '/' ) ) break;
                }
                if ( idx1 == _bufStart ) {
                    doToken( EndTagOpen, _bufStart+2 );
                } else if ( idx1 > _bufStart ) {
                    doToken( Content, idx1 );
                } else if ( _done ) {
                    doToken( Content, _bufEnd );
                }
                break;

            case RcdataMode:
                printf( "SgmlLexer::data() -- RCDATA mode is not implemented!!!\n" );
                break;

            case PcdataMode:
                // Recognize all delimeters.
                ch1 = _buf[_bufStart];
                if ( length > 1 ) {
                    ch2 = _buf[_bufStart+1];
                } else {
                    ch2 = 0;
                }
                
                if ( ( ch1 == '<' ) || ( ch1 == '&' ) ) {
                    if ( ch1 == '<' ) {
                        if ( ch2 == '!' ) {
                            doToken( MarkupDeclOpen, _bufStart+2 );
                        } else if ( ch2 == '/' ) {
                            doToken( EndTagOpen, _bufStart+2 );
                        } else if ( ch2 == '?' ) {
                            doToken( ProcInstOpen, _bufStart+2 );
                        } else if ( ( ch2 ) || ( _done ) ) {
                            doToken( StartTagOpen, _bufStart+1 );
                        } else {
                            // Need more data.
                            return;
                        }
                    } else {
                        if ( ch2 == '#' ) {
                            // Charater reference.
                            idx1 = _bufStart+2;
                        } else {
                            // Entity reference.
                            idx1 = _bufStart+1;
                        }

                        for ( idx2 = idx1; ( idx2 < _bufEnd ) && ( isalnum( _buf[idx2] ) ); idx2++ );

                        if ( ( idx2 < _bufEnd ) || ( _done ) ) {
                            _bufStart = idx2;
                            if ( ( _bufStart < _bufEnd ) && ( _buf[_bufStart] == ';' ) ) {
                                _bufStart++;
                            }
                            if ( ch2 == '#' ) {
                                // Character reference.
                                QString num;
                                for ( ; idx1 < idx2; idx1++ ) num += _buf[idx1];
                                _buf[--_bufStart] = num.toInt();
                                doToken( Content, _bufStart+1 );
                            } else {
                                QString entity;
                                for ( ; idx1 < idx2; idx1++ ) entity += _buf[idx1];
                                QString text;
                                if ( ( _dtd ) && ( _dtd->getEntity( entity, text ) ) ) {
                                    int textLength = text.length();
                                    int length = _bufEnd - _bufStart;
                                    if ( _bufSize < ( textLength + length + 1 ) ) {
                                        // Resize buffer.
                                        _bufSize = textLength + length + 1;
                                        char* newBuf = new char[_bufSize];
                                        for ( int i = 0; i <= length; i++ ) {
                                            newBuf[_bufSize - length - 1 + i] = _buf[_bufStart + i];
                                        }
                                        delete [] _buf;
                                        _buf = newBuf;
                                        _bufStart = _bufSize - length - 1;
                                        _bufEnd = _bufSize - 1;
                                    }
                                    if ( textLength >= _bufStart ) {
                                        // Move buffer text to end of buffer.
                                        for ( int i = length; i >= 0; i-- ) {
                                            _buf[_bufSize - length - 1 + i] = _buf[_bufStart + i];
                                        }
                                        _bufStart = _bufSize - length - 1;
                                        _bufEnd = _bufSize - 1;
                                    }
                                    
                                    // Prepend the buffer with entity text.
                                    for ( int i = 0; i < textLength; i++ ) {
                                        _buf[_bufStart - textLength + i] = text[i];
                                    }
                                    _bufStart -= textLength;
                                }
                            }
                        } else {
                            // Need more data.
                            return;
                        }
                    }
                } else {
                    for ( idx1 = _bufStart; ( idx1 < _bufEnd ) && ( _buf[idx1] != '<' ); idx1++ );
                    for ( idx2 = _bufStart; ( idx2 < _bufEnd ) && ( _buf[idx2] != '&' ); idx2++ );
                    if ( idx1 < _bufEnd ) {
                        if ( idx1 < idx2 ) {
                            doToken( Content, idx1 );
                        } else {
                            doToken( Content, idx2 );
                        }
                    } else if ( idx2 < _bufEnd ) {
                        doToken( Content, idx2 );
                    } else {
                        // It's all content.
                        doToken( Content, _bufEnd );
                    }
                }
                break;

            case MarkupMode:
            {
                while ( isspace( _buf[_bufStart] ) ) {
                    _bufStart++;
                }

                if ( isdigit( _buf[_bufStart] ) ) {
                    for ( idx1 = _bufStart; ( idx1 < _bufEnd ) && ( isdigit( _buf[idx1] ) || ( _buf[idx1] == '%' ) ); idx1++ );
                    if ( ( idx1 >= _bufEnd ) && ( !_done ) ) {
                        // Cannot find end of token (yet).
                        return;
                    }
                    doToken( Integer, idx1 );
                } else if ( ( isalpha( _buf[_bufStart] ) ) || ( _buf[_bufStart] == '/' ) || ( _buf[_bufStart] == '!' ) ) {
                    for ( idx1 = _bufStart; ( idx1 < _bufEnd ) && ( isalnum( _buf[idx1] ) || ( _buf[idx1] == '.' ) || ( _buf[idx1] == '-' ) || ( _buf[idx1] == '/' ) || ( _buf[idx1] == '!' ) || ( _buf[idx1] == ':' ) || ( _buf[idx1] == '_' ) || ( _buf[idx1] == '@' ) /*|| ( _buf[idx1] == '?' )*/ ); idx1++ );
                    if ( ( idx1 >= _bufEnd ) && ( !_done ) ) {
                        // Cannot find end of token (yet).
                        return;
                    }
                    
                    // See if we recognize this string.
                    int len = idx1 - _bufStart;
                    isKeyword = FALSE;
                    for ( int i = 0; lextable[i].text; i++ ) {
                        if ( ( len == lextable[i].length ) && ( !strncasecmp( _buf+_bufStart, lextable[i].text, len ) ) ) {
                            doToken( lextable[i].token, idx1 );
                            isKeyword = TRUE;
                            continue;
                        }
                    }

                    // It must be an identifier if we're here.
                    if ( !isKeyword ) {
                        doToken( Identifier, idx1 );
                    }
                } else {
                    ch1 = _buf[_bufStart];
                    if ( length > 1 ) {
                        ch2 = _buf[_bufStart+1];
                    } else {
                        ch2 = 0;
                    }
                    
                    if ( ch1 == ',' ) {
                        doToken( Comma, _bufStart+1 );
                    } else if ( ch1 == '&' ) {
                        doToken( Ampersand, _bufStart+1 );
                    } else if ( ch1 == '?' ) {
                        doToken( QuestionMark, _bufStart+1 );
                    } else if ( ch1 == '+' ) {
                        if ( ch2 == '(' ) {
                            doToken( InclusionListStart, _bufStart+2 );
                        } else if ( ( ch2 ) || ( _done ) ) {
                            doToken( PlusSign, _bufStart+1 );
                        }
                    } else if ( ch1 == '#' ) {
                        doToken( ReservedNameIndicator, _bufStart+1 );
                    } else if ( ch1 == '*' ) {
                        doToken( Star, _bufStart+1 );
                    } else if ( ch1 == '|' ) {
                        doToken( OrOperator, _bufStart+1 );
                    } else if ( ch1 == ';' ) {
                        doToken( ReferenceClose, _bufStart+1 );
                    } else if ( ch1 == '(' ) {
                        doToken( GroupOpen, _bufStart+1 );
                    } else if ( ch1 == ')' ) {
                        doToken( GroupClose, _bufStart+1 );
                    } else if ( ch1 == '[' ) {
                        doToken( DeclSubsetOpen, _bufStart+1 );
                    } else if ( ch1 == ']' ) {
                        doToken( DeclSubsetClose, _bufStart+1 );
                    } else if ( ch1 == '%' ) {
                        if ( ch2 != ' ' ) {
                            // Parameter entity reference open.
                            QString entity;
                            ch1 = _buf[_bufStart+1];
                            for ( idx1 = _bufStart+2; ( idx1 < _bufEnd ) && ( ( isalpha( ch1 ) ) || ( ch1 == '.' ) || ( ch1 == '-' ) ); idx1++ ) {
                                entity += ch1;
                                ch1 = _buf[idx1];
                            }

                            if ( idx1 < _bufEnd ) {
                                // Process entity reference.
                                _bufStart = idx1 - 1;
                                if ( _buf[_bufStart] == ';' ) {
                                    _bufStart++;
                                }
//                                printf( "SgmlLexer::data() -- entity ref = '%s'\n", (const char*)entity );
                                QString text;
                                if ( ( _dtd ) && ( _dtd->getEntity( entity, text ) ) ) {
//                                    printf( "SgmlLexer::data() -- entity text = '%s'\n", (const char*)text );
                                    int textLength = text.length();
                                    int length = _bufEnd - _bufStart;
                                    if ( _bufSize < ( textLength + length + 1 ) ) {
                                        // Resize buffer.
                                        _bufSize = textLength + length + 1;
                                        char* newBuf = new char[_bufSize];
                                        for ( int i = 0; i <= length; i++ ) {
                                            newBuf[_bufSize - length - 1 + i] = _buf[_bufStart + i];
                                        }
                                        delete [] _buf;
                                        _buf = newBuf;
                                        _bufStart = _bufSize - length - 1;
                                        _bufEnd = _bufSize - 1;
                                    }
                                    if ( textLength >= _bufStart ) {
                                        // Move buffer text to end of buffer.
                                        for ( int i = length; i >= 0; i-- ) {
                                            _buf[_bufSize - length - 1 + i] = _buf[_bufStart + i];
                                        }
                                        _bufStart = _bufSize - length - 1;
                                        _bufEnd = _bufSize - 1;
                                    }
                                    
                                    // Prepend the buffer with entity text.
                                    for ( int i = 0; i < textLength; i++ ) {
                                        _buf[_bufStart - textLength + i] = text[i];
                                    }
                                    _bufStart -= textLength;
                                }
                            } else if ( _done ) {
                                // Unterminated entity reference.
                                _bufStart = _bufEnd;
                            } else {
                                // Need more data.
                                return;
                            }
                        } else {
                            doToken( Percent, _bufStart+1 );
                        }
                    } else if ( ch1 == '=' ) {
                        doToken( EqualSign, _bufStart+1 );
                    } else if ( ch1 == '-' ) {
                        if ( ch2 == '(' ) {
                            doToken( ExclusionListStart, _bufStart+2 );
                        } else if ( ch2 == '-' ) {
                            for ( idx1 = _bufStart+2; idx1 < _bufEnd - 2; idx1++ ) {
                                if ( ( _buf[idx1-1] != '-' ) && ( _buf[idx1] == '-' ) && ( _buf[idx1+1] == '-' ) && ( _buf[idx1+2] != '-' ) ) {
                                    break;
                                }
                            }
                            if ( idx1 < ( _bufEnd - 2 ) ) {
                                doToken( Comment, idx1+2 );
                            } else if ( _done ) {
                                // Unterminated comment.
                                doToken( Comment, _bufEnd );
                            } else {
                                // Need more data.
                                return;
                            }
                        } else if ( ( ch2 ) || ( _done ) ) {
                            doToken( MinusSign, _bufStart+1 );
                        } else {
                            // Need more data.
                            return;
                        }
                    } else if ( ch1 == '\'' ) {
                        for ( idx1 = _bufStart+1; ( idx1 < _bufEnd ) && ( _buf[idx1] != '\'' ); idx1++ );
                        if ( idx1 < _bufEnd ) {
                            _bufStart++; // Eat leading quote.
                            doToken( StringLiteral, idx1 );
                            _bufStart++; // Eat trailing quote.
                        } else if ( _done ) {
                            // Unterminated string literal.
                            _bufStart++; // Eat leading quote.
                            doToken( StringLiteral, _bufEnd );
                        } else {
                            // Need more data.
                            return;
                        }
                    } else if ( ch1 == '"' ) {
                        for ( idx1 = _bufStart+1; ( idx1 < _bufEnd ) && ( _buf[idx1] != '"' ); idx1++ );
                        if ( idx1 < _bufEnd ) {
                            _bufStart++; // Eat leading quote.
                            doToken( StringLiteral, idx1 );
                            _bufStart++; // Eat trailing quote.
                        } else if ( _done ) {
                            // Unterminated string literal.
                            _bufStart++; // Eat leading quote.
                            doToken( StringLiteral, _bufEnd );
                        } else {
                            // Need more data.
                            return;
                        }
                    } else if ( ch1 == '<' ) {
                        if ( ch2 == '!' ) {
                            doToken( MarkupDeclOpen, _bufStart+2 );
                        } else if ( ch2 == '/' ) {
                            doToken( EndTagOpen, _bufStart+2 );
                        } else if ( ch2 == '?' ) {
                            doToken( ProcInstOpen, _bufStart+2 );
                        } else if ( ( ch2 ) || ( _done ) ) {
                            doToken( StartTagOpen, _bufStart+1 );
                        } else {
                            // Need more data.
                            return;
                        }
                    } else if ( ch1 == '>' ) {
                        doToken( MarkupClose, _bufStart+1 );
                    } else if ( _bufStart < _bufEnd ) {
                        // This is a problem.  Eat the current character
                        // we don't know what else to do with it.
                        _bufStart++;
                    }
                }
            }
            break;
        }
    }

    if ( _done ) {
        emit done();
        delete this;
    }
}

// Signals that no further input will arrive.
//
// The end-of-data flag is raised first, then data() is run once more with an
// empty chunk so that whatever is still buffered gets flushed.  With the flag
// set, data() emits done() and deletes the lexer once the buffer is drained,
// so the object must not be used after this call.
void SgmlLexer::endOfData()
{
    // Must be set before the flush so that partial tokens are emitted
    // instead of being held back for more input.
    _done = TRUE;

    // Null pointer, zero bytes: nothing is appended, only a final scan.
    data( 0, 0 );
}

// Selects the recognition mode used for the input that is tokenized next.
//
//   m - the mode to switch to.
void SgmlLexer::mode( Mode m )
{
    // data() re-reads the mode before every token, so the change applies to
    // the very next token it scans.
    _mode = m;
}

// Reports the recognition mode the lexer is currently in.
SgmlLexer::Mode SgmlLexer::mode()
{
    return _mode;
}

// Emits one token and consumes its text from the input buffer.
//
// The bytes in [_bufStart, end) are copied into the NUL-terminated token
// buffer (grown if necessary), _bufStart is advanced to end, and the token()
// signal is emitted with the token type and the copied text.
//
//   tok - the type of token being emitted.
//   end - buffer index one past the last byte of the token's text.
void SgmlLexer::doToken( SgmlLexer::Token tok, int end )
{
    // Keep the token inside the buffered input.  An end beyond _bufEnd would
    // copy bytes that were never received and leave _bufStart past _bufEnd,
    // which corrupts the bookkeeping in the next data() call; an end before
    // _bufStart would write the terminator at a negative index.
    if ( end > _bufEnd ) {
        end = _bufEnd;
    }
    if ( end < _bufStart ) {
        end = _bufStart;
    }

    int length = end - _bufStart;

    if ( _tokenSize < length + 1 ) {
        // Allocate the replacement before releasing the old buffer so that
        // _token never dangles if the allocation fails.
        char* grown = new char[ length + 1 ];
        delete [] _token;
        _token = grown;
        _tokenSize = length + 1;
    }

    for ( int i = 0; i < length; i++ ) {
        _token[i] = _buf[_bufStart+i];
    }
    _token[length] = 0;

    // Consume the text before emitting, so the buffer position is already
    // up to date while the signal is being delivered.
    _bufStart = end;

    emit token( tok, _token );
}
