// Edinburgh Speech Tools  2.1-release
// (Doxygen page-navigation residue: All Classes | Files | Functions | Variables | Typedefs | Enumerations | Enumerator | Friends | Macros | Groups | Pages)
// EST_Ngrammar.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A general class for ngrams (bi-gram, tri-gram etc) */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_NGRAMMAR_H__
41 #define __EST_NGRAMMAR_H__
42 
43 #include <cstdarg>
44 #include <cstdlib>
45 
46 using namespace std;
47 
48 #include "EST_String.h"
49 #include "EST_Val.h"
50 #include "EST_rw_status.h"
51 #include "EST_types.h"
52 #include "EST_FMatrix.h"
53 #include "EST_TList.h"
54 #include "EST_StringTrie.h"
55 #include "EST_simplestats.h"
56 #include "EST_PST.h"
57 #include "EST_string_aux.h"
58 #include "EST_math.h"
59 
60 // HTK style
61 #define SENTENCE_START_MARKER "!ENTER"
62 #define SENTENCE_END_MARKER "!EXIT"
63 #define OOV_MARKER "!OOV"
64 
65 #define EST_NGRAMBIN_MAGIC 1315402337
66 
67 // for compressed save/load
68 #define GZIP_FILENAME_EXTENSION "gz"
69 #define COMPRESS_FILENAME_EXTENSION "Z"
70 
71 // Ultimate floor
72 #define TINY_FREQ 1.0e-10
73 
74 // ngram state - represents the N-1 word history and contains
75 // the pdf of the next word
76 
78 
79 private:
80 
81 protected:
83  int p_id; // a 'name'
84 
85 public:
87 
88  p_pdf()
89 
90  {
91  init();
92  };
93  EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};
95  {clear();init(id,pdf);};
97  EST_NgrammarState(const EST_NgrammarState *const s);
99 
100  EST_IVector path; // how we got here
101 
102  // initialise
103  void clear();
104  void init();
105  void init(int id, EST_Discrete *d);
106  void init(int id, const EST_DiscreteProbDistribution &pdf);
107 
108  // build
109  void cumulate(const int index, const double count=1)
110  {p_pdf.cumulate(index,count);};
111  void cumulate(const EST_String &word, const double count=1)
112  {p_pdf.cumulate(word,count);};
113 
114  // access
115  int id() const {return p_id; };
116  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
117  EST_DiscreteProbDistribution &pdf() {return p_pdf; };
118  double probability(const EST_String &w) const
119  {return p_pdf.probability(w);}
120  double probability(int w) const {return p_pdf.probability(w);}
121  double frequency(const EST_String &w) const
122  {return p_pdf.frequency(w);}
123  double frequency(int w) const {return p_pdf.frequency(w);}
124  const EST_String &most_probable(double *prob = NULL) const
125  {return p_pdf.most_probable(prob);}
126 
127 friend ostream& operator<<(ostream& s, const EST_NgrammarState &a);
128 
129 };
130 
132 
133 private:
134 
135 protected:
136  int p_level; // = 0 for root node
137  double backoff_weight;
139  EST_StringTrie children;
140 
141  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
142  const EST_StrVector &words);
143  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
144  const EST_IVector &words);
145 public:
147  { init(); };
148  EST_BackoffNgrammarState(const EST_Discrete *d,int level)
149  {clear();init(d,level);};
151  {clear();init(pdf,level);};
155 
156  // initialise
157  void clear();
158  void init();
159  void init(const EST_Discrete *d, int level);
160  void init(const EST_DiscreteProbDistribution &pdf, int level);
161 
162  // build
163  bool accumulate(const EST_StrVector &words,
164  const double count=1);
165  bool accumulate(const EST_IVector &words,
166  const double count=1);
167  // access
168  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
169  EST_DiscreteProbDistribution &pdf() {return p_pdf; };
170  double probability(const EST_String &w) const
171  {return p_pdf.probability(w);}
172  double frequency(const EST_String &w) const
173  {return p_pdf.frequency(w);}
174  const EST_String &most_probable(double *prob = NULL) const
175  {return p_pdf.most_probable(prob);}
176 
177  const int level() const {return p_level;}
178 
179  EST_BackoffNgrammarState* get_child(const EST_String &word) const
180  {
181  return (EST_BackoffNgrammarState*)children.lookup(word);
182  }
183  EST_BackoffNgrammarState* get_child(const int word) const
184  {
185  return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
186  }
187 
188  void remove_child(EST_BackoffNgrammarState *child,
189  const EST_String &name);
190 
191  // recursive delete of contents and children
192  void zap();
193 
194  const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const;
195 
196  bool ngram_exists(const EST_StrVector &words,
197  const double threshold) const;
198  const double get_backoff_weight() const {return backoff_weight; }
199  const double get_backoff_weight(const EST_StrVector &words) const;
200  bool set_backoff_weight(const EST_StrVector &words, const double w);
201  void frequency_of_frequencies(EST_DVector &ff);
202 
203  void print_freqs(ostream &os,const int order,EST_String followers="");
204 
205 friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a);
206 
207 };
208 
209 class EST_Ngrammar;
210 EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor=0.0);
211 EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0);
212 EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0);
213 void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order=0);
214 void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order=0);
215 bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount=0);
216 void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount=0.5);
217 
219 
220 public:
221 
222  // 3 representations : sparse, dense and backed off. User specifies which.
223  enum representation_t {sparse, dense, backoff};
224 
225  // now only keep frequencies (or log frequencies)
226  // probabilities (or log probabilities) can be done
227  // on the fly quickly enough
228  enum entry_t {frequencies, log_frequencies};
229 
230 protected:
231 
232  // each instance of an EST_Ngrammar is a grammar of fixed order
233  // e.g. a bigram (order = 2)
234  int p_order;
235  int p_num_samples;
236 
237  double p_number_of_sentences; // which were used to build this grammar
238 
239 
240  EST_String p_sentence_start_marker;
241  EST_String p_sentence_end_marker;
242 
243  // only one representation in use at a time
244  representation_t p_representation;
245  entry_t p_entry_type;
246 
247  // sparse representation is a tree structure
248  // holding only those N-grams which were seen
249  EST_PredictionSuffixTree sparse_representation;
250  bool init_sparse_representation();
251 
252  // dense representation is just an array of all states
253  bool init_dense_representation();
254 
255  // backoff representation is also a tree structure
256  // but the root state pdf is the most recent word in the
257  // ngram and going down the tree is going back in time....
258  // here is the root node :
259  EST_BackoffNgrammarState *backoff_representation;
260 
261  double backoff_threshold;
262 
263  // need a non-zero unigram floor to enable backing off
264  double backoff_unigram_floor_freq;
265 
266  // instead of simple discounting, we have a (possibly) different
267  // discount per order and per frequency
268  // e.g. backoff_discount[2](4) contains the discount to be
269  // applied to a trigram frequency of 4
270  // backoff_discount[0] is unused (we don't discount unigrams)
271  EST_DVector *backoff_discount;
272  const double get_backoff_discount(const int order, const double freq) const;
273 
274  bool init_backoff_representation();
275  void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches
276  void backoff_restore_unigram_states();
277  int p_num_states; // == p_vocab_size ^ (p_ord-1) if fully dense
278  EST_NgrammarState *p_states; // state id is index into this array
279  int find_dense_state_index(const EST_IVector &words, int index=0) const;
280 
281  // and the reverse
282  const EST_StrVector &make_ngram_from_index(const int i) const;
283 
284  // vocabulary
285  EST_Discrete *vocab;
286  EST_Discrete *pred_vocab; // may be different from state vocab
287  bool init_vocab(const EST_StrList &wordlist);
288  bool init_vocab(const EST_StrList &word_list,
289  const EST_StrList &pred_list);
290 
291  // make sure vocab matches a given wordlist
292  bool check_vocab(const EST_StrList &wordlist);
293 
294  EST_DiscreteProbDistribution vocab_pdf; // overall pdf
295 
296  const EST_String &lastword(const EST_StrVector &words) const
297  { return words(p_order-1); }
298  const int lastword(const EST_IVector &words) const
299  { return words(p_order-1); }
300  // are we allowing out-of-vocabulary words, or is the vocabulary closed?
301  bool allow_oov;
302 
303  bool sparse_to_dense();
304  bool dense_to_sparse();
305 
306  // these aren't sorted yet ...
307  void take_logs();
308  void take_exps();
309  void freqs_to_probs(); // just calls normalise
310 
311  bool build_sparse(const EST_String &filename,
312  const EST_String &prev,
313  const EST_String &prev_prev,
314  const EST_String &last);
315  // for dense and backoff
316  bool build_ngram(const EST_String &filename,
317  const EST_String &prev,
318  const EST_String &prev_prev,
319  const EST_String &last,
320  const EST_String &input_format);
321 
322  // go through all matching ngrams ( *(ngram[i])="" matches anything )
323  void iterate(EST_StrVector &words,
324  void (*function)(EST_Ngrammar *n,
325  EST_StrVector &words,
326  void *params),
327  void *params);
328 
329  // same, but with a constant Ngrammar
330  void const_iterate(EST_StrVector &words,
331  void (*function)(const EST_Ngrammar *const n,
332  EST_StrVector &words,
333  void *params),
334  void *params) const;
335 
336  bool p_init(int o, representation_t r);
337 
338  // new filename returned of we had to copy stdin to a
339  // temporary file - must delete it later !
340  bool oov_preprocess(const EST_String &filename,
341  EST_String &new_filename,
342  const EST_String &what);
343 
344 
345  const EST_NgrammarState &find_state_const(const EST_StrVector &words)const;
346  EST_NgrammarState &find_state(const EST_StrVector &words);
347  const EST_NgrammarState &find_state_const(const EST_IVector &words) const;
348  EST_NgrammarState &find_state(const EST_IVector &words);
349 
350  // special versions for backoff grammars
351  const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;
352  const double backoff_reverse_probability_sub(const EST_StrVector &words,
353  const EST_BackoffNgrammarState *root) const;
354  const double backoff_probability(const EST_StrVector &words,
355  const bool trace=false) const;
356  const double backoff_reverse_probability(const EST_StrVector &words) const;
357  const EST_String & backoff_most_probable(const EST_StrVector &words,
358  double *prob = NULL) const;
359 
360  // backoff representation isn't a nice array of states
361  // so use this to visit every node in the tree
362  // and apply the function to that node
363  void backoff_traverse(EST_BackoffNgrammarState *start_state,
364  void (*function)(EST_BackoffNgrammarState *s,
365  void *params),
366  void *params);
367 
368  // visit every node at a given level
369  void backoff_traverse(EST_BackoffNgrammarState *start_state,
370  void (*function)(EST_BackoffNgrammarState *s,
371  void *params),
372  void *params, const int level);
373 public:
374 
375  EST_Ngrammar() {default_values();}
376 
377  EST_Ngrammar(int o, representation_t r,
378  const EST_StrList &wordlist)
379  {
380  default_values(); init(o,r,wordlist);
381  }
382 
383  // When state trans vocab differs from prediction vocab
384  EST_Ngrammar(int o, representation_t r,
385  const EST_StrList &wordlist,
386  const EST_StrList &predlist)
387  {
388  default_values(); init(o,r,wordlist,predlist);
389  }
390 
391  EST_Ngrammar(int o, representation_t r, EST_Discrete &v)
392  {
393  default_values(); init(o,r,v);
394  }
395  ~EST_Ngrammar();
396 
397  void default_values();
398  void clear();
399  bool init(int o, representation_t r,
400  const EST_StrList &wordlist);
401  bool init(int o, representation_t r,
402  const EST_StrList &wordlist,
403  const EST_StrList &predlist);
404  bool init(int o, representation_t r, EST_Discrete &v);
405  bool init(int o, representation_t r,
406  EST_Discrete &v,EST_Discrete &pv);
407 
408  // access
409  int num_states(void) const { return p_num_states;}
410  double samples(void) const { return p_num_samples;}
411  int order() const { return p_order; }
412  int get_vocab_length() const { return vocab?vocab->length():0; }
413  EST_String get_vocab_word(int i) const;
414  int get_vocab_word(const EST_String &s) const;
415  int get_pred_vocab_length() const { return pred_vocab->length(); }
416  EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }
417  int get_pred_vocab_word(const EST_String &s) const
418  { return pred_vocab->name(s); }
419  int closed_vocab() const {return !allow_oov; }
420  entry_t entry_type() const {return p_entry_type;}
421  representation_t representation() const
422  { return p_representation;}
423 
424  // build
425  bool build(const EST_StrList &filenames,
426  const EST_String &prev = SENTENCE_START_MARKER,
427  const EST_String &prev_prev = SENTENCE_END_MARKER,
428  const EST_String &last = SENTENCE_END_MARKER,
429  const EST_String &input_format = "",
430  const EST_String &oov_mode = "",
431  const int mincount=1,
432  const int maxcount=10);
433 
434  // Accumulate ngrams
435  void accumulate(const EST_StrVector &words,
436  const double count=1);
437  //const int index=0);
438  void accumulate(const EST_IVector &words,
439  const double count=1);
440  //const int index=0);
441 
442  // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x
443  void make_htk_compatible();
444 
445  // I/O functions
446  EST_read_status load(const EST_String &filename);
447  EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);
448  EST_write_status save(const EST_String &filename,
449  const EST_String type="cstr_ascii",
450  const bool trace=false,
451  double floor=0.0);
452 
453  int wordlist_index(const EST_String &word, const bool report=true) const;
454  const EST_String &wordlist_index(int i) const;
455  int predlist_index(const EST_String &word) const;
456  const EST_String &predlist_index(int i) const;
457 
458  // set
459  bool set_entry_type(entry_t new_type);
460  bool set_representation(representation_t new_representation);
461 
462  // probability distributions
463  // -------------------------
464  // flag 'force' forces computation of probs on-the-fly if necessary
465  double probability(const EST_StrVector &words, bool force=false,
466  const bool trace=false) const;
467  double frequency(const EST_StrVector &words, bool force=false,
468  const bool trace=false) const;
469 
470  const EST_String &predict(const EST_StrVector &words,
471  double *prob,int *state) const;
472  const EST_String &predict(const EST_StrVector &words) const
473  {double p; int state; return predict(words,&p,&state); }
474  const EST_String &predict(const EST_StrVector &words,double *prob) const
475  {int state; return predict(words,prob,&state); }
476 
477  const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;
478  const EST_String &predict(const EST_IVector &words) const
479  {double p; int state; return predict(words,&p,&state); }
480  const EST_String &predict(const EST_IVector &words,double *prob) const
481  {int state; return predict(words,prob,&state); }
482 
483  int find_state_id(const EST_StrVector &words) const;
484  int find_state_id(const EST_IVector &words) const;
485  int find_next_state_id(int state, int word) const;
486  // fast versions for common N
487  //const double probability(const EST_String w1);
488  //const double probability(const EST_String w1,const EST_String w2);
489  //const double probability(const EST_String w1,const EST_String w2,
490  //const EST_String w2);
491 
492  // reverse - probability of words[0..order-2] given word[order-1]
493  double reverse_probability(const EST_StrVector &words,
494  bool force=false) const;
495  double reverse_probability(const EST_IVector &words,
496  bool force=false) const;
497 
498  // predict, where words has 'order' elements and the last one is "" or NULL
499  const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;
500  const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;
501  const EST_DiscreteProbDistribution &prob_dist(int state) const;
502 
503 // bool stats(const EST_String filename,
504 // double &raw_entropy, double &count,
505 // double &entropy, double &perplexity,
506 // const EST_String &prev = SENTENCE_START_MARKER,
507 // const EST_String &prev_prev = SENTENCE_END_MARKER,
508 // const EST_String &last = SENTENCE_END_MARKER,
509 // const EST_String &input_format = "") const;
510 
511  void fill_window_start(EST_IVector &window,
512  const EST_String &prev,
513  const EST_String &prev_prev) const;
514 
515  void fill_window_start(EST_StrVector &window,
516  const EST_String &prev,
517  const EST_String &prev_prev) const;
518 
519  // why anybody would want to do this ....
520  //EST_Ngrammar &operator =(const EST_Ngrammar &a);
521 
522  bool ngram_exists(const EST_StrVector &words) const;
523  bool ngram_exists(const EST_StrVector &words, const double threshold) const;
524  const double get_backoff_weight(const EST_StrVector &words) const;
525  bool set_backoff_weight(const EST_StrVector &words, const double w);
526 
527  void print_freqs(ostream &os,double floor=0.0);
528 
529  // i/o functions
530  // -------------
531  friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
532  friend EST_read_status load_ngram_htk_ascii(const EST_String filename,
533  EST_Ngrammar &n);
534  friend EST_read_status load_ngram_htk_binary(const EST_String filename,
535  EST_Ngrammar &n);
536  friend EST_read_status load_ngram_arpa(const EST_String filename,
537  EST_Ngrammar &n,
538  const EST_StrList &vocab);
539  friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,
540  EST_Ngrammar &n);
541  friend EST_read_status load_ngram_cstr_bin(const EST_String filename,
542  EST_Ngrammar &n);
543 
544  friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word,
545  ostream *ost,
546  EST_Ngrammar &n,
547  double floor);
548  friend EST_write_status save_ngram_htk_ascii(const EST_String filename,
549  EST_Ngrammar &n,
550  double floor);
551 
552  //friend EST_write_status save_ngram_htk_binary(const EST_String filename,
553  // EST_Ngrammar &n);
554  friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,
555  EST_Ngrammar &n,
556  const bool trace,
557  double floor);
558  friend EST_write_status save_ngram_cstr_bin(const EST_String filename,
559  EST_Ngrammar &n,
560  const bool trace,
561  double floor);
562  friend EST_write_status save_ngram_arpa(const EST_String filename,
563  EST_Ngrammar &n);
564  friend EST_write_status save_ngram_arpa_sub(ostream *ost,
565  EST_Ngrammar &n,
566  const EST_StrVector &words);
567  friend EST_write_status save_ngram_wfst(const EST_String filename,
568  EST_Ngrammar &n);
569 
570  // Auxiliary functions
571 
572  // smoothing
573 friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);
574 friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);
575 friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);
576 friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
577  const double default_discount);
578 
579 friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
580  EST_Ngrammar &ngram);
581 friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
582  EST_Ngrammar &ngram, int smooth_thresh);
583 
584  // frequencies below mincount get backed off
585  // frequencies above maxcount are not smoothed(discounted)
586  bool compute_backoff_weights(const int mincount=1,
587  const int maxcount=10);
588 
589 
590  bool merge(EST_Ngrammar &n,float weight);
591 
592 friend class EST_BackoffNgrammar;
593 
594 };
595 
596 void Ngram_freqsmooth(EST_Ngrammar &ngram,
597  int smooth_thresh1,
598  int smooth_thresh2);
599 
600 // utils
601 void slide(EST_IVector &i, const int l);
602 void slide(EST_StrVector &i, const int l);
603 
604 bool test_stats(EST_Ngrammar &ngram,
605  const EST_String &filename,
606  double &raw_entropy,
607  double &count,
608  double &entropy,
609  double &perplexity,
610  const EST_String &input_format,
611  const EST_String &prev = SENTENCE_START_MARKER,
612  const EST_String &prev_prev = SENTENCE_END_MARKER,
613  const EST_String &last = SENTENCE_END_MARKER);
614 
615 VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar)
616 
617 #endif // __EST_NGRAMMAR_H__
// --- Doxygen tooltip residue from HTML extraction (not part of this header) ---
// EST_Item * root(const EST_Item *n)
//   return root node of tree; previous sibling (sister) of n
// INLINE int length() const
//   number of items in vector. Definition: EST_TVector.h:250
// A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
//   Definition: EST_DMatrix.h:118
// A string tree index class for indexing arbitrary objects by strings of characters.
// Utility EST_String Functions header file.