/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
 
  $Id: japanese_tokenizer.cpp,v 1.10 2004/06/21 09:34:35 taku-ku Exp $;

  Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
  This is free software with ABSOLUTELY NO WARRANTY.
  
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/  
#include "tokenizer.h"
#include "common.h"
#include "codeconv.h"
#include "japanese_tokenizer.h"

namespace MeCab 
{
  /**
   * Opens the tokenizer and builds the character-class lookup tables.
   *
   * Calls Tokenizer::open, reads the "charset" profile string from param to
   * select the encoding, and then fills
   *   - ascii_table, indexed by a single byte value, and
   *   - ja_table, indexed by the two values produced by the kuten2* conversion
   *     that matches the selected charset,
   * with character-class constants (OTHER, SPACE, SYMBOL, ALPHA, ...).
   *
   * @param param  configuration source; its "charset" entry must be one of
   *               sjis / shift-jis / shift_jis, euc / euc_jp / euc-jp,
   *               utf8 / utf_8.  Any other value (including an empty string)
   *               is rejected.
   * @return true; every failure is reported by exception.
   * @throws std::runtime_error if Tokenizer::open fails or the charset name is
   *         not recognised.  For an unknown charset the message is also
   *         stored in _what.
   */
  bool JapaneseTokenizer::open (Param &param)
  {
    // Open the parent once; the tables below are only useful if that worked.
    if (! Tokenizer::open (param)) throw std::runtime_error (_what);

    std::string tmp = param.getProfileString ("charset");
    charset = EUC_JP;
    if      (tmp == "sjis" || tmp == "shift-jis" || tmp == "shift_jis") charset = SHIFT_JIS;
    else if (tmp == "euc"  || tmp == "euc_jp" || tmp == "euc-jp") charset = EUC_JP;
    else if (tmp == "utf8" || tmp == "utf_8") charset = UTF8; // not implemented
    else {
      _what = std::string ("JapaneseTokenizer::JapaneseTokenizer: unknown charset > ") + tmp;
      throw std::runtime_error (_what);
    }

    // Ref: "Understanding Japanese Information Processing", page 298-
    // JIS X 0208-1990 kuten (row/cell) code table.
    // Rows and cells are 1-based (1..94); index 0 is allocated but never
    // copied into ja_table.
    unsigned char kuten_table [95][95];
    for (int i = 0; i <= 94; ++i)
      for (int j = 0; j <= 94; ++j)
        kuten_table[i][j] = OTHER;

    // initialize every cell of rows 16..94 as KANJI
    for (int i = 16; i <= 94; ++i)
      for (int j = 1; j <= 94; ++j)
        kuten_table[i][j] = KANJI;

    // Rows 1..7 hold the non-kanji classes.  Cell 1-1 (the two-byte space)
    // is deliberately left as OTHER.
    // kuten_table[1][1] = SPACE; // 2byte Space
    for (int i = 2;  i <= 94; ++i) kuten_table[1][i] = SYMBOL;
    for (int i = 1;  i <= 84; ++i) kuten_table[2][i] = SYMBOL;
    for (int i = 16; i <= 25; ++i) kuten_table[3][i] = ALPHANUMERIC;
    for (int i = 33; i <= 90; ++i) kuten_table[3][i] = ALPHA;
    for (int i = 1;  i <= 83; ++i) kuten_table[4][i] = HIRAGANA;
    for (int i = 1;  i <= 86; ++i) kuten_table[5][i] = KATAKANA;
    for (int i = 1;  i <= 56; ++i) kuten_table[6][i] = GREEK;
    for (int i = 1;  i <= 81; ++i) kuten_table[7][i] = CYRILLIC;

    // Cell 1-28 is overridden from SYMBOL to KATAKANA.
    // (In JIS X 0208 this cell is the prolonged sound mark.)
    kuten_table[1][28] = KATAKANA;

    // Single-byte (ASCII) table: everything is OTHER unless set below.
    for (int i = 0;   i <= 255;  ++i) ascii_table[i] = OTHER;
    for (int i = 33;  i <= 47;   ++i) ascii_table[i] = SYMBOL;
    for (int i = 48;  i <= 57;   ++i) ascii_table[i] = ALPHANUMERIC;
    for (int i = 58;  i <= 64;   ++i) ascii_table[i] = SYMBOL;
    for (int i = 65;  i <= 90;   ++i) ascii_table[i] = ALPHA;
    for (int i = 91;  i <= 96;   ++i) ascii_table[i] = SYMBOL;
    for (int i = 97;  i <= 122;  ++i) ascii_table[i] = ALPHA;
    for (int i = 123; i <= 126;  ++i) ascii_table[i] = SYMBOL;

    ascii_table[32] = SPACE; // ' '
    ascii_table[13] = SPACE; // '\r'
    ascii_table[10] = SPACE; // '\n'
    ascii_table[9]  = SPACE; // '\t'

    // Clear the two-byte table.
    // NOTE(review): these loops clear indices 0..254 only.  If ja_table is
    // declared with 256 rows/columns, index 255 is never initialised here --
    // confirm against the declaration before changing the bound.
    for (int i = 0; i < 255; ++i)
      for (int j = 0; j < 255; ++j)
        ja_table[i][j] = OTHER;

    // Copy every kuten cell into ja_table at the position given by the
    // charset-specific conversion of (row, cell).
    // NOTE(review): the kuten2* helpers presumably rewrite h and l in place
    // (they are called for their effect on the arguments) -- confirm.
    for (unsigned int i = 1; i <= 94 ; ++i) {
      for (unsigned int j = 1; j <= 94 ; ++j) {
        unsigned int h = i;
        unsigned int l = j;
        switch (charset) {
        case EUC_JP:    kuten2euc  (h,l); break;
        case SHIFT_JIS: kuten2sjis (h,l); break;
        case UTF8:      kuten2jis  (h,l); break; // use UTF8 -> JISX0208 table
        default:        kuten2euc  (h,l); break; // never be here
        }
        ja_table[h][l] = kuten_table[i][j];
      }
    }

    // Half-width katakana: under EUC-JP it is a two-byte sequence with lead
    // value 142, under Shift_JIS it is a single byte, so it goes into
    // ascii_table instead.
    if (charset == EUC_JP)
      for (int i = 166; i <= 222;  ++i) ja_table[142][i] = HALFKATAKANA;
    else if (charset == SHIFT_JIS)
      for (int i = 166; i <= 222;  ++i) ascii_table[i] = HALFKATAKANA;

    // The parent was already opened successfully at the top of this function;
    // calling Tokenizer::open a second time here only repeated that work.
    return true;
  }
   
  /**
   * Builds the list of candidate nodes that start at the first non-SPACE
   * position of [begin, end).
   *
   * Leading SPACE-class characters are skipped, every dictionary entry
   * returned by dic.commonPrefixSearch becomes a node (stat 0), and, unless
   * a dictionary hit exists for a HIRAGANA or KANJI start, one extra node
   * built from unkToken / unkFeature (stat 1) is put at the head of the list.
   *
   * @param begin  start of the text to analyse
   * @param end    end of the text to analyse
   * @return head of the candidate list, linked through Node::rnext.  Each
   *         node's `end` field is an offset measured from `begin`, so it
   *         includes the skipped white space.
   */
  Node *JapaneseTokenizer::lookup (const char *begin, const char *end)
  {
    int cls;             // character class found at the first non-space position
    unsigned int width;  // added to the start pointer to step past that character

    // Step over leading white space; cls and width are filled in by the call.
    char *head = skipCharClass (begin, end, SPACE, cls, width);

    // One node per dictionary entry that is a prefix of the remaining text.
    // Each new node is pushed onto the front of the chain.
    Node *chain = 0;
    Token **hit = dic.commonPrefixSearch (head, end - head);
    while (*hit) {
      Token *tok = *hit;
      Node *known = getNewNode ();
      known->stat    = 0;
      known->token   = tok;
      known->length  = tok->length;
      known->surface = head;
      known->end     = head - begin + tok->length;
      known->rnext   = chain;
      known->feature = const_cast<char *>(dic.getFeature () + tok->feature);
      chain = known;
      ++hit;
    }

    // For hiragana and kanji the dictionary hits alone are returned when
    // there are any; no unknown-word node is added in that case.
    const bool singleCharClass = (cls == HIRAGANA || cls == KANJI);
    if (singleCharClass && chain)
      return chain;

    // Decide how far the unknown-word node extends.
    char *tail;
    if (singleCharClass || cls == OTHER) {
      // single char is defined as UNKNOWN
      tail = head + width;
    } else {
      // group the following characters of the same class together
      tail = skipCharClass (head + width, end, cls);
    }

    Node *unknown    = getNewNode ();
    unknown->stat    = 1;
    unknown->token   = &unkToken;
    unknown->surface = head;
    unknown->length  = tail - head;
    unknown->end     = tail - begin;
    unknown->feature = unkFeature;
    unknown->rnext   = chain;
    return unknown;
  }
};
