40 #ifndef __EST_NGRAMMAR_H__
41 #define __EST_NGRAMMAR_H__
48 #include "EST_String.h"
50 #include "EST_rw_status.h"
51 #include "EST_types.h"
52 #include "EST_FMatrix.h"
54 #include "EST_StringTrie.h"
55 #include "EST_simplestats.h"
// Marker words. SENTENCE_START_MARKER and SENTENCE_END_MARKER are used as
// default values for the prev / prev_prev / last parameters further down
// this header.
61 #define SENTENCE_START_MARKER "!ENTER"
62 #define SENTENCE_END_MARKER "!EXIT"
// NOTE(review): presumably the token substituted for out-of-vocabulary
// words -- inferred from the name only; confirm at the use sites.
63 #define OOV_MARKER "!OOV"
// 1315402337 == 0x4E677261, i.e. the ASCII bytes 'N' 'g' 'r' 'a'.
// NOTE(review): presumably the magic number of the binary ngram file
// format -- confirm against the binary load/save routines.
65 #define EST_NGRAMBIN_MAGIC 1315402337
// Filename extensions (written without the leading dot).
// NOTE(review): presumably used to recognise compressed grammar files.
68 #define GZIP_FILENAME_EXTENSION "gz"
69 #define COMPRESS_FILENAME_EXTENSION "Z"
// NOTE(review): a very small frequency value; its exact role (floor,
// epsilon, ...) is not visible in this header -- confirm at the use sites.
72 #define TINY_FREQ 1.0e-10
95 {clear();init(
id,pdf);};
109 void cumulate(
const int index,
const double count=1)
110 {p_pdf.cumulate(index,count);};
111 void cumulate(
const EST_String &word,
const double count=1)
112 {p_pdf.cumulate(word,count);};
115 int id()
const {
return p_id; };
119 {
return p_pdf.probability(w);}
120 double probability(
int w)
const {
return p_pdf.probability(w);}
122 {
return p_pdf.frequency(w);}
123 double frequency(
int w)
const {
return p_pdf.frequency(w);}
124 const EST_String &most_probable(
double *prob = NULL)
const
125 {
return p_pdf.most_probable(prob);}
// Stored back-off weight. A no-argument get_backoff_weight() accessor that
// returns a member of this name appears later in this listing.
// NOTE(review): the enclosing class is not visible here -- confirm that the
// accessor and this member belong to the same class.
137 double backoff_weight;
149 {clear();init(d,level);};
151 {clear();init(pdf,level);};
164 const double count=1);
166 const double count=1);
171 {
return p_pdf.probability(w);}
173 {
return p_pdf.frequency(w);}
174 const EST_String &most_probable(
double *prob = NULL)
const
175 {
return p_pdf.most_probable(prob);}
177 const int level()
const {
return p_level;}
197 const double threshold)
const;
198 const double get_backoff_weight()
const {
return backoff_weight; }
199 const double get_backoff_weight(
const EST_StrVector &words)
const;
200 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
203 void print_freqs(ostream &os,
const int order,
EST_String followers=
"");
211 EST_write_status save_ngram_cstr_ascii(
const EST_String filename,
EST_Ngrammar &n,
const bool trace=
false,
double floor=0.0);
212 EST_write_status save_ngram_cstr_bin(
const EST_String filename,
EST_Ngrammar &n,
const bool trace=
false,
double floor=0.0);
215 bool Good_Turing_smooth(
EST_Ngrammar &n,
int maxcount,
int mincount=0);
216 void Good_Turing_discount(
EST_Ngrammar &ngrammar,
const int maxcount,
const double default_discount=0.5);
/// Selects the grammar's internal representation.
/// Enumerator order is significant: it fixes the values 0, 1 and 2.
enum representation_t {
    sparse,
    dense,
    backoff
};
/// Selects what kind of value the grammar's entries hold.
/// Enumerator order is significant: it fixes the values 0 and 1.
enum entry_t {
    frequencies,
    log_frequencies
};
// NOTE(review): presumably a running count of sentences seen -- inferred
// from the name only; confirm in the implementation.
237 double p_number_of_sentences;
// Value handed back by representation().
244 representation_t p_representation;
// Value handed back by entry_type().
245 entry_t p_entry_type;
// Initialisers named after the sparse and dense representations; bodies are
// defined out of line.
// NOTE(review): the meaning of the bool result is not visible here.
250 bool init_sparse_representation();
253 bool init_dense_representation();
// NOTE(review): back-off tuning values; where they are set and read is not
// visible in this header.
261 double backoff_threshold;
264 double backoff_unigram_floor_freq;
272 const double get_backoff_discount(
const int order,
const double freq)
const;
274 bool init_backoff_representation();
276 void backoff_restore_unigram_states();
279 int find_dense_state_index(
const EST_IVector &words,
int index=0)
const;
282 const EST_StrVector &make_ngram_from_index(
const int i)
const;
297 {
return words(p_order-1); }
299 {
return words(p_order-1); }
303 bool sparse_to_dense();
304 bool dense_to_sparse();
309 void freqs_to_probs();
336 bool p_init(
int o, representation_t r);
340 bool oov_preprocess(
const EST_String &filename,
352 const double backoff_reverse_probability_sub(
const EST_StrVector &words,
355 const bool trace=
false)
const;
356 const double backoff_reverse_probability(
const EST_StrVector &words)
const;
358 double *prob = NULL)
const;
372 void *params,
const int level);
380 default_values(); init(o,r,wordlist);
388 default_values(); init(o,r,wordlist,predlist);
393 default_values(); init(o,r,v);
// Defined out of line. In every call visible in this listing it is invoked
// immediately before init(...).
// NOTE(review): presumably resets members to their defaults -- inferred
// from the name only.
397 void default_values();
399 bool init(
int o, representation_t r,
401 bool init(
int o, representation_t r,
405 bool init(
int o, representation_t r,
409 int num_states(
void)
const {
return p_num_states;}
410 double samples(
void)
const {
return p_num_samples;}
411 int order()
const {
return p_order; }
412 int get_vocab_length()
const {
return vocab?vocab->
length():0; }
414 int get_vocab_word(
const EST_String &s)
const;
415 int get_pred_vocab_length()
const {
return pred_vocab->length(); }
416 EST_String get_pred_vocab_word(
int i)
const {
return pred_vocab->name(i); }
417 int get_pred_vocab_word(
const EST_String &s)
const
418 {
return pred_vocab->name(s); }
419 int closed_vocab()
const {
return !allow_oov; }
420 entry_t entry_type()
const {
return p_entry_type;}
421 representation_t representation()
const
422 {
return p_representation;}
426 const EST_String &prev = SENTENCE_START_MARKER,
427 const EST_String &prev_prev = SENTENCE_END_MARKER,
431 const int mincount=1,
432 const int maxcount=10);
436 const double count=1);
439 const double count=1);
443 void make_htk_compatible();
446 EST_read_status load(
const EST_String &filename);
448 EST_write_status save(
const EST_String &filename,
450 const bool trace=
false,
453 int wordlist_index(
const EST_String &word,
const bool report=
true)
const;
454 const EST_String &wordlist_index(
int i)
const;
455 int predlist_index(
const EST_String &word)
const;
456 const EST_String &predlist_index(
int i)
const;
459 bool set_entry_type(entry_t new_type);
460 bool set_representation(representation_t new_representation);
465 double probability(
const EST_StrVector &words,
bool force=
false,
466 const bool trace=
false)
const;
467 double frequency(
const EST_StrVector &words,
bool force=
false,
468 const bool trace=
false)
const;
471 double *prob,
int *state)
const;
473 {
double p;
int state;
return predict(words,&p,&state); }
475 {
int state;
return predict(words,prob,&state); }
479 {
double p;
int state;
return predict(words,&p,&state); }
481 {
int state;
return predict(words,prob,&state); }
485 int find_next_state_id(
int state,
int word)
const;
494 bool force=
false)
const;
495 double reverse_probability(
const EST_IVector &words,
496 bool force=
false)
const;
523 bool ngram_exists(
const EST_StrVector &words,
const double threshold)
const;
524 const double get_backoff_weight(
const EST_StrVector &words)
const;
525 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
527 void print_freqs(ostream &os,
double floor=0.0);
531 friend ostream& operator<<(ostream& s,
EST_Ngrammar &n);
532 friend EST_read_status load_ngram_htk_ascii(
const EST_String filename,
534 friend EST_read_status load_ngram_htk_binary(
const EST_String filename,
536 friend EST_read_status load_ngram_arpa(
const EST_String filename,
539 friend EST_read_status load_ngram_cstr_ascii(
const EST_String filename,
541 friend EST_read_status load_ngram_cstr_bin(
const EST_String filename,
544 friend EST_write_status save_ngram_htk_ascii_sub(
const EST_String &word,
548 friend EST_write_status save_ngram_htk_ascii(
const EST_String filename,
554 friend EST_write_status save_ngram_cstr_ascii(
const EST_String filename,
558 friend EST_write_status save_ngram_cstr_bin(
const EST_String filename,
562 friend EST_write_status save_ngram_arpa(
const EST_String filename,
564 friend EST_write_status save_ngram_arpa_sub(ostream *ost,
567 friend EST_write_status save_ngram_wfst(
const EST_String filename,
575 friend bool Good_Turing_smooth(
EST_Ngrammar &n,
int maxcount,
int mincount);
576 friend void Good_Turing_discount(
EST_Ngrammar &ngrammar,
const int maxcount,
577 const double default_discount);
579 friend void fs_build_backoff_ngrams(
EST_Ngrammar *backoff_ngrams,
581 friend int fs_backoff_smooth(
EST_Ngrammar *backoff_ngrams,
586 bool compute_backoff_weights(
const int mincount=1,
587 const int maxcount=10);
592 friend class EST_BackoffNgrammar;
611 const EST_String &prev = SENTENCE_START_MARKER,
612 const EST_String &prev_prev = SENTENCE_END_MARKER,
613 const EST_String &last = SENTENCE_END_MARKER);
617 #endif // __EST_NGRAMMAR_H__
EST_Item * root(const EST_Item *n)
return root node of tree; previous sibling (sister) of n
INLINE int length() const
number of items in vector.
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
A string tree index class for indexing arbitrary objects by strings of characters.
Utility EST_String Functions header file.