Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
scfg_make_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Build a stochastic context free grammar with N non-terminals and */
37 /* M terminals specified as lists or numbers */
38 /* Probabilities are either even or random on rules and specified as */
39 /* probs or -log prob */
40 /* */
41 /*=======================================================================*/
42 #include <cstdlib>
43 #include <cstdio>
44 #include <iostream>
45 #include <fstream>
46 #include <cstring>
47 #include "EST.h"
48 #include "EST_SCFG.h"
49 #include "siod.h"
50 
51 EST_String outfile = "-";
52 EST_String domain = "nlogp";
53 EST_String values = "equal";
54 
55 static int scfg_make_main(int argc, char **argv);
56 
57 static void load_symbols(EST_StrList &syms,const EST_String &filename);
58 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix);
59 static LISP assign_probs(LISP rules, const EST_String &domain,
60  const EST_String &values);
61 static LISP make_all_rules(const EST_StrList &NonTerminals,
62  const EST_StrList &Terminals);
63 static void generate_probs(double *probs,int num);
64 
65 
66 
67 
68 int main(int argc, char **argv)
69 {
70 
71  scfg_make_main(argc,argv);
72 
73  exit(0);
74  return 0;
75 }
76 
77 static int scfg_make_main(int argc, char **argv)
78 {
79  // Top level function generates a probabilistic grammar
80  EST_Option al;
81  EST_StrList files;
82  EST_StrList NonTerminals, Terminals;
83  LISP rules,r;
84  FILE *fd;
85 
86  parse_command_line
87  (argc, argv,
88  EST_String("[options]\n")+
89  "Summary: Build a stochastic context free grammar\n"+
90  "-nonterms <string> Number of nonterminals or file containing them\n"+
91  "-terms <string> Number of terminals or file containing them\n"+
92  "-domain <string> {nlogp}\n"+
93  " Values to be nlogp (negative log probabilities)\n"+
94  " or prob (probabilities)\n"+
95  "-values <string> {equal}\n"+
96  " General initial scores on rules as equal or\n"
97  " random\n"+
98  "-heap <int> {500000}\n"+
99  " Set size of Lisp heap, only needed for large grammars\n"+
100  "-o <ofile> File to save grammar (default stdout)\n",
101  files, al);
102 
103  if (al.present("-o"))
104  outfile = al.val("-o");
105  else
106  outfile = "-";
107 
108  if (al.present("-domain"))
109  {
110  if (al.val("-domain") == "nlogp")
111  domain = "nlogp";
112  else if (al.val("-domain") == "prob")
113  domain = "prob";
114  else
115  {
116  cerr << "scfg_make: domain must be nlogp or prob" << endl;
117  exit(1);
118  }
119  }
120 
121  if (al.present("-values"))
122  {
123  if (al.val("-values") == "equal")
124  values = "equal";
125  else if (al.val("-values") == "random")
126  values = "random";
127  else
128  {
129  cerr << "scfg_make: values must be equal or random" << endl;
130  exit(1);
131  }
132  }
133 
134  if (al.present("-nonterms"))
135  {
136  if (al.val("-nonterms").matches(RXint))
137  make_symbols(NonTerminals,al.ival("-nonterms"),"NT");
138  else
139  load_symbols(NonTerminals,al.val("-nonterms"));
140  }
141  else
142  {
143  cerr << "scfg_make: no nonterminals specified" << endl;
144  exit(1);
145  }
146 
147  if (al.present("-terms"))
148  {
149  if (al.val("-terms").matches(RXint))
150  make_symbols(Terminals,al.ival("-terms"),"T");
151  else
152  load_symbols(Terminals,al.val("-terms"));
153  }
154  else
155  {
156  cerr << "scfg_make: no terminals specified" << endl;
157  exit(1);
158  }
159 
160  siod_init(al.ival("-heap"));
161 
162  rules = make_all_rules(NonTerminals,Terminals);
163  rules = assign_probs(rules,domain,values);
164 
165  if (outfile == "-")
166  fd = stdout;
167  else
168  {
169  if ((fd=fopen(outfile,"w")) == NULL)
170  {
171  cerr << "scfg_make: failed to open file \"" << outfile <<
172  "\" for writing" << endl;
173  exit(1);
174  }
175  }
176 
177  for (r=rules; r != NIL; r=cdr(r))
178  pprint_to_fd(fd,car(r));
179 
180  if (fd != stdout)
181  fclose(fd);
182 
183 
184  return 0;
185 }
186 
187 static LISP make_all_rules(const EST_StrList &NonTerminals,
188  const EST_StrList &Terminals)
189 {
190  // Build all possibly rules (CNF)
191  // NT -> NT NT and NT -> T
192  EST_Litem *p,*q,*r;
193  LISP rules = NIL;
194 
195  for (p=NonTerminals.head(); p != 0; p=p->next())
196  {
197  int num_rules_nt = (NonTerminals.length()*NonTerminals.length())+
198  Terminals.length();
199  double *probs = new double[num_rules_nt];
200  generate_probs(probs,num_rules_nt);
201  int i=0;
202  for (q=NonTerminals.head(); q != 0; q=q->next())
203  for (r=NonTerminals.head(); r != 0; r=r->next(),i++)
204  rules = cons(cons(flocons(probs[i]),
205  cons(rintern(NonTerminals(p)),
206  cons(rintern(NonTerminals(q)),
207  cons(rintern(NonTerminals(r)),NIL)))),
208  rules);
209  for (q=Terminals.head(); q != 0; q=q->next(),i++)
210  rules = cons(cons(flocons(probs[i]),
211  cons(rintern(NonTerminals(p)),
212  cons(rintern(Terminals(q)),NIL))),
213  rules);
214  delete [] probs;
215  }
216 
217  return reverse(rules);
218 }
219 
220 static void generate_probs(double *probs,int num)
221 {
222  // Generate probabilities
223  int i;
224 
225  if (values == "equal")
226  {
227  double defp = 1.0/(float)num;
228  for (i=0; i < num; i++)
229  probs[i] = defp;
230  }
231  else if (values == "random")
232  {
233  // This isn't random but is somewhat arbitrary
234  double sum = 0;
235  for (i=0; i < num; i++)
236  {
237  probs[i] = (double)abs(rand())/(double)0x7fff;
238  sum += probs[i];
239  }
240  for (i=0; i < num; i++)
241  {
242  probs[i] /= sum;
243  }
244  }
245  else
246  {
247  cerr << "scfg_make: unknown value for probability distribution"
248  << endl;
249  exit(1);
250  }
251 }
252 
253 static LISP assign_probs(LISP rules, const EST_String &domain,
254  const EST_String &values)
255 {
256  // Modify probs (don't know how to do random probs yet)
257  LISP r;
258  (void)values;
259 
260  if (domain == "nlogp")
261  for (r=rules; r != NIL; r = cdr(r))
262  if (get_c_float(car(car(r))) == 0)
263  CAR(car(r)) = flocons(40);
264  else
265  CAR(car(r)) = flocons(-log(get_c_float(car(car(r)))));
266 
267  return rules;
268 }
269 
270 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix)
271 {
272  // Generate n symbols with given prefix
273  int i;
274  int magnitude,t;
275 
276  for (magnitude=0,t=n; t > 0; t=t/10)
277  magnitude++;
278 
279  char *name = walloc(char,prefix.length()+magnitude+1);
280  char *skel = walloc(char,prefix.length()+5);
281  sprintf(skel,"%s%%%02dd",(const char *)prefix,magnitude);
282 
283  for (i=0; i < n; i++)
284  {
285  sprintf(name,skel,i);
286  syms.append(name);
287  }
288 
289  wfree(name);
290  wfree(skel);
291 
292 }
293 
294 
295 static void load_symbols(EST_StrList &syms,const EST_String &filename)
296 {
297  // Load symbol list for file
298 
299  load_StrList(filename,syms);
300 
301 }
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:76
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
int length(void) const
Length of string ({not} length of underlying chunk)
Definition: EST_String.h:244
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:198
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.
int matches(const char *e, int pos=0) const
Exactly match this string?
Definition: EST_String.cc:652