Edinburgh Speech Tools  2.4-release
ngrammar_io.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* IO functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40 
41 #include <cstdlib>
42 #include <fstream>
43 #include <iostream>
44 #include "EST_unix.h"
45 #include <cstring>
46 #include <climits>
47 #include <cfloat>
48 #include "EST_String.h"
49 #include "EST_Ngrammar.h"
50 #include "EST_Token.h"
51 #include "EST_cutils.h"
52 
53 EST_read_status
54 load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
55 {
56  (void)filename;
57  (void)n;
58  return wrong_format;
59 }
60 
61 EST_read_status
62 load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
63 {
64  (void)filename;
65  (void)n;
66  return wrong_format;
67 }
68 
69 EST_read_status
70 load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
71 {
72 
73  EST_TokenStream ts;
74  EST_String s;
75  int i,j,k, order=0;
76  double occur,weight;
77  int this_num,this_order;
78 
79  if (ts.open(filename) == -1)
80  return misc_read_error;
81 
82  // find backslash data backslash
83  while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));
84 
85  if (ts.eof())
86  {
87  ts.close();
88  return wrong_format;
89  }
90 
91  // find order and numbers of ngrams
92 
93  // somewhere to keep numbers
94  EST_IVector nums(100); // not going to have anything bigger than a 100-gram !
95 
96  while (!ts.eof())
97  {
98  // have we got to next section
99  if (ts.peek().string().contains("-grams:"))
100  break;
101 
102  s=ts.get_upto_eoln().string();
103 
104  if(s.contains("ngram ") && s.contains("="))
105  {
106 
107  s=s.after("ngram ");
108  this_order=atoi(s.before("="));
109  this_num=atoi(s.after("="));
110 
111  //cerr << "There are " << this_num << " " << this_order
112  //<< "-grams" << endl;
113 
114  nums[this_order] = this_num;
115 
116  if(this_order > order)
117  order = this_order;
118  }
119 
120  }
121 
122 
123  if(order==0)
124  {
125  //cerr << "No ngram ?=? in header !" << endl;
126  ts.close();
127  return wrong_format;
128  }
129 
130  //cerr << "Initialising " << order << "-grammar" << endl;
131  if(!n.init(order,EST_Ngrammar::backoff,vocab))
132  return misc_read_error;
133 
134  // read data
135  for(i=1;i<=order;i++)
136  {
137 
138  EST_StrVector window(i);
139 
140  // find start of data for this order "<order>-grams:"
141  EST_String tmp = "\\" + itoString(i) + "-grams:";
142  while (!ts.eof())
143  {
144  s=ts.get().string();
145  if (s.contains(tmp))
146  break;
147  }
148 
149 
150  if(ts.eof())
151  {
152  cerr << "Unexpected end of grammar file whilst looking for '"
153  << tmp << "'" << endl;
154  return misc_read_error;
155  }
156 
157  //cerr << "Found order " << i << " : " << tmp << endl;
158  //cerr << "Looking for " << nums(i) << " ngrams" << endl;
159  // look for nums(i) ngrams
160 
161  for(j=0;j<nums(i);j++)
162  {
163 
164  for (k=0; ((k<i) && !ts.eof()); k++)
165  window[k] = ts.get().string();
166 
167  if(ts.eof())
168  {
169  cerr << "Unexpected end of file whilst reading " << i
170  << "-grams !" << endl;
171  return misc_read_error;
172  }
173 
174  // can't for backoff grammars, need to set probs directly
175 
176  cerr << "ooooooooops" << endl;
177  return wrong_format;
178 
179  occur = atof(ts.get().string());
180  n.accumulate(window,occur);
181 
182  // backoff weight ?
183  if (!ts.eoln())
184  {
185  weight = atof(ts.get().string());
186  n.set_backoff_weight(window,weight);
187  }
188 
189  if (!ts.eoln())
190  {
191  cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
192  << ts.filepos() << endl;
193  ts.close();
194  return misc_read_error;
195  }
196  }
197 
198  } // loop through orders
199 
200 
201  // find backslash end backslash
202  while (!ts.eof())
203  if (ts.get().string() == "\\end\\")
204  {
205  ts.close();
206  return format_ok;
207 
208  }
209 
210  cerr << "Missing \\end\\ !" << endl;
211 
212  ts.close();
213  return misc_read_error;
214 
215 }
216 
217 EST_read_status
218 load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
219 {
220  EST_TokenStream ts;
221  int i, order;
222  double occur;
223 
224  if (ts.open(filename) == -1)
225  return misc_read_error;
226 
227  if (ts.peek().string() != "Ngram_2")
228  {
229  ts.close();
230  return wrong_format;
231  }
232  ts.get(); // skip magic number
233 
234  order = atoi(ts.get().string());
235  ts.get_upto_eoln(); // skip to next line
236  EST_StrList vocab;
237  EST_StrList pred_vocab; // may be different
238 
239  while (!ts.eoln())
240  vocab.append(ts.get().string());
241  ts.get_upto_eoln(); // skip to next line
242  while (!ts.eoln())
243  pred_vocab.append(ts.get().string());
244 
245  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
246  {
247  cerr << "Something may be wrong with the vocab lists in '"
248  << filename << "'" << endl;
249  return misc_read_error;
250  }
251 
252  EST_StrVector window(order);
253 
254  while(!ts.eof())
255  {
256  for (i=0; i < order; i++)
257  window[i] = ts.get().string();
258  if (ts.get().string() != ":")
259  {
260  cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
261  << ts.filepos() << endl;
262  return misc_read_error;
263  }
264  occur = atof(ts.get().string());
265  n.accumulate(window,occur);
266  if (!ts.eoln())
267  {
268  cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
269  << ts.filepos() << endl;
270  return misc_read_error;
271  }
272  }
273 
274  ts.close();
275 
276  return format_ok;
277 }
278 
279 EST_read_status
280 load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
281 {
282  EST_TokenStream ts;
283  int i,j,order;
284  EST_Litem *k;
285  int num_entries;
286  double approx_num_samples = 0.0;
287  long freq_data_start, freq_data_end;
288  FILE *ifd;
289  int magic = 0;
290  int swap = FALSE;
291 
292  if ((ifd=fopen(filename,"rb")) == NULL)
293  return misc_read_error;
294  fread(&magic,sizeof(int),1,ifd);
295 
296  if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
297  swap = TRUE;
298  else if (magic != EST_NGRAMBIN_MAGIC)
299  return wrong_format;
300  if (ts.open(ifd, FALSE) == -1)
301  return misc_read_error;
302 
303  ts.set_SingleCharSymbols("\n");
304  ts.set_WhiteSpaceChars(" \t\r");
305 
306  if (ts.peek().string() != "mBin_2")
307  {
308  fclose(ifd);
309  ts.close();
310  return wrong_format;
311  }
312  ts.get(); // skip magic number
313 
314  order = atoi(ts.get().string());
315  if (ts.get() != "\n")
316  {
317  fclose(ifd);
318  ts.close();
319  return misc_read_error;
320  }
321  EST_StrList vocab;
322  EST_StrList pred_vocab; // may be different
323 
324  while ((ts.peek() != "\n") && (!ts.eof()))
325  vocab.append(ts.get().string());
326  ts.get(); // skip newline
327  while ((ts.peek() != "\n") && (!ts.eof()))
328  pred_vocab.append(ts.get().string());
329 
330  // Need to get to the position one after the newline and
331  // who knows what TokenStream has already read,
332  fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);
333 
334  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
335  {
336  ts.close();
337  fclose(ifd);
338  return misc_read_error;
339  }
340 
341  EST_StrVector window(order);
342 
343  freq_data_start = ftell(ifd);
344  fseek(ifd,0,SEEK_END);
345  freq_data_end = ftell(ifd);
346  num_entries = (freq_data_end-freq_data_start)/sizeof(double);
347  double *dd = new double[num_entries];
348 
349  // Go back to start of data
350  fseek(ifd,freq_data_start,SEEK_SET);
351 
352  if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
353  {
354  cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
355  ts.close();
356  fclose(ifd);
357  return misc_read_error;
358  }
359  if (swap)
360  swap_bytes_double(dd,num_entries);
361 
362  for(j=i=0;i<n.num_states();i++)
363  {
364  if (j >= num_entries)
365  {
366  cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
367  ts.close();
368  fclose(ifd);
369  return misc_read_error;
370  }
371  for (k=n.p_states[i].pdf().item_start();
372  (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
373  k = n.p_states[i].pdf().item_next(k))
374  {
375  n.p_states[i].pdf().set_frequency(k,dd[j]);
376  // Update global info too
377  approx_num_samples += dd[j]; // probably not right
378  n.vocab_pdf.cumulate(k,dd[j]);
379 
380  // Number of consecutive occurrences of this frequency as in
381  // dd[j+1] if its a negative number
382  if (j+1 >= num_entries)
383  j++;
384  else if (dd[j+1] < -1)
385  dd[j+1]++;
386  else if (dd[j+1] == -1)
387  j +=2;
388  else
389  j++;
390  }
391  }
392 
393  // With smoothing num_samples might not be as exact as you like
394  n.p_num_samples = (int)approx_num_samples;
395 
396  delete [] dd;
397 
398  ts.close();
399  fclose(ifd);
400 
401  return format_ok;
402 }
403 
404 // ====================================================================
405 
406 EST_write_status
407 save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
408  EST_Ngrammar &n, double floor)
409 {
410  EST_Litem *k;
411  EST_String name;
412  double freq;
413  EST_StrVector this_ngram(2); // assumes bigram
414  this_ngram[0] = word;
416  this_pdf = n.prob_dist(this_ngram);
417 
418  double lfreq=-1;
419  int lcount=0;
420  double total_freq=0;
421 
422  double floor_prob_total = floor * (n.pred_vocab->length()-1);
423 
424  if (word == n.p_sentence_end_marker)
425  {
426  *ost << word;
427  *ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
428  return write_ok;
429  }
430 
431  if(floor_prob_total > 1)
432  {
433  cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
434  floor = 1.0 / (double)(n.pred_vocab->length()-1);
435  floor_prob_total = 1;
436  }
437 
438  // not efficient but who cares ?
439  for (k=this_pdf.item_start();
440  !this_pdf.item_end(k);
441  k = this_pdf.item_next(k))
442  {
443  this_pdf.item_freq(k,name,freq);
444  if(name != n.p_sentence_start_marker)
445  {
446  total_freq += freq;
447  }
448  }
449 
450 
451  // 0 for prob(word,start marker)
452  *ost << word << " 0 ";
453 
454  if (total_freq <= 0)
455  {
456  *ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
457  *ost << n.pred_vocab->length()-1 << " " << endl;
458  }
459  else
460  {
461  lfreq=-1;
462 
463  for (k=this_pdf.item_start();
464  !this_pdf.item_end(k);
465  k = this_pdf.item_next(k))
466  {
467  this_pdf.item_freq(k,name,freq);
468 
469  if ( (name == n.p_sentence_start_marker) ||
470  (name == n.p_sentence_end_marker) ||
471  (name == OOV_MARKER) )
472  continue;
473 
474  if (freq == lfreq)
475  lcount++;
476  else
477  {
478  if (lcount > 1)
479  *ost << "*" << lcount << " ";
480  else
481  *ost << " ";
482 
483  lcount=1;
484  lfreq = freq;
485 
486  if(freq > 0)
487  {
488  double base_prob = freq / total_freq;
489 
490  // and floor/scale it
491  *ost << floor + ( base_prob * (1-floor_prob_total) );
492 
493  }
494  else
495  *ost << floor;
496 
497  }
498 
499 
500  }
501 
502  } // total_freq > 0
503 
504 
505  if(!n.closed_vocab())
506  {
507 
508  // not fully tested !!!!!!!!
509 
510  *ost << 0 << " ERROR !!!!!!!! ";
511  }
512 
513 
514  if (total_freq > 0)
515  {
516  freq = this_pdf.frequency(n.p_sentence_end_marker);
517 
518  if(freq == lfreq)
519  {
520  lcount++;
521  *ost << "*" << lcount << " " << endl;
522  }
523  else
524  {
525 
526  if (lcount > 1)
527  *ost << "*" << lcount << " ";
528  else
529  *ost << " ";
530 
531  if(freq > 0)
532  {
533  double base_prob = freq / total_freq;
534 
535  // and floor/scale it
536  *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
537 
538  }
539  else
540  *ost << floor << endl;
541  }
542  }
543 
544  return write_ok;
545 }
546 
547 EST_write_status
548 save_ngram_htk_ascii(const EST_String filename,
549  EST_Ngrammar &n, double floor)
550 {
551 
552  ostream *ost;
553 
554  // only for bigram
555  if(n.order() != 2)
556  {
557  cerr << "Can only save bigrams in htk_ascii format" << endl;
558  return misc_write_error;
559  }
560 
561  if (floor < 0)
562  {
563  cerr << "Negative floor probability does not make sense !" << endl;
564  return misc_write_error;
565  }
566 
567  if (filename == "-")
568  ost = &cout;
569  else
570  ost = new ofstream(filename);
571 
572  if(!(*ost))
573  return write_fail;
574 
575  if(floor * (n.pred_vocab->length()-1) > 1)
576  {
577  floor = 1.0 / (double)(n.pred_vocab->length()-1);
578  cerr << "ERROR : floor is impossibly large, scaling it to ";
579  cerr << floor << endl;
580  }
581 
582  int i;
583 
584  if(n.p_sentence_start_marker == "")
585  {
586  cerr << "Can't save in HTK format as no sentence start/end tags"
587  << " were given !" << endl;
588  return misc_write_error;
589  }
590 
591  // need '!ENTER' (or whatever) as first word- that's HTK for you
592  save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);
593 
594  // the real words
595  for(i=0;i<n.vocab->length();i++)
596  {
597  if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
598  (n.vocab->name(i) != n.p_sentence_end_marker) &&
599  (n.vocab->name(i) != OOV_MARKER) )
600  save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
601  }
602 
603  if(!n.closed_vocab())
604  save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);
605 
606  save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);
607 
608  if(ost != &cout)
609  delete ost;
610 
611  return write_ok;
612 }
613 
614 /*
615  EST_write_status
616  save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
617  {
618  return write_ok;
619  }
620  */
621 
622 void
623 count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
624 {
625  if(n->ngram_exists(ngram))
626  *((double*)count) += 1;
627 }
628 
629 void
630 save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
631 {
632 
633  int i;
634 
635  if(n->ngram_exists(ngram))
636  {
637  *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
638  for(i=0;i<ngram.n();i++)
639  *((ostream*)(ost)) << ngram(i) << " ";
640 
641  if ((n->representation() == EST_Ngrammar::backoff) &&
642  (n->order() > ngram.n()) )
643  *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
644  //<< " = "
645  //<< n->get_backoff_weight(ngram) << " ";
646 
647  *((ostream*)(ost)) << endl;
648 
649  }
650 }
651 
652 EST_write_status
653 save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
654 {
655  // ARPA MIT-LL format - see HTK manual !!
656 
657  ostream *ost;
658  int i,o;
659 
660  if (filename == "-")
661  ost = &cout;
662  else
663  ost = new ofstream(filename);
664 
665  if (!(*ost))
666  return write_fail;
667 
668  //n.set_entry_type(EST_Ngrammar::probabilities);
669  //n.make_htk_compatible(); // fix enter/exit probs
670  //*ost << *(n.vocab) << endl;
671 
672  *ost << "\\data\\" << endl;
673 
674  double *count = new double;
675 
676  if (n.representation() == EST_Ngrammar::backoff)
677  {
678  for(o=1;o<=n.order();o++)
679  {
680  EST_StrVector ngram(o);
681  for(i=0;i<o;i++)
682  ngram[i] = "";
683  *count =0;
684 
685  // this is a deeply silly way to count them,
686  // we could traverse the tree directly !
687  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
688  *ost << "ngram " << o << "=" << *count << endl;
689  }
690 
691  for(o=1;o<=n.order();o++)
692  {
693  *ost << endl;
694  *ost << "\\" << o << "-grams:" << endl;
695  EST_StrVector ngram(o);
696  for(i=0;i<o;i++)
697  ngram[i] = "";
698  n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
699  }
700 
701  }
702  else
703  {
704  EST_StrVector ngram(n.order());
705  for(i=0;i<n.order();i++)
706  ngram[i] = "";
707  *count =0;
708  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
709  *ost << "ngram " << n.order() << "=" << *count << endl;
710 
711  *ost << endl;
712  *ost << "\\" << n.order() << "-grams:" << endl;
713 
714  for(i=0;i<n.order();i++)
715  ngram[i] = "";
716  n.iterate(ngram,&save_ngram_arpa_sub,ost);
717 
718  }
719 
720  *ost << "\\end\\" << endl;
721 
722  if (ost != &cout)
723  delete ost;
724 
725  return write_ok;
726 }
727 
728 EST_write_status
729 save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
730  const bool trace, double floor)
731 {
732  // awb's format
733  (void)trace;
734  ostream *ost;
735  int i;
736  EST_Litem *k;
737 
738  if (filename == "-")
739  ost = &cout;
740  else
741  ost = new ofstream(filename);
742 
743  if(!(*ost))
744  return write_fail;
745 
746  *ost << "Ngram_2 " << n.order() << endl;
747  for (i=0; i < n.vocab->length(); i++)
748  *ost << n.vocab->name(i) << " ";
749  *ost << endl;
750  for (i=0; i < n.pred_vocab->length(); i++)
751  *ost << n.pred_vocab->name(i) << " ";
752  *ost << endl;
753 
754  if (n.representation() == EST_Ngrammar::dense)
755  n.print_freqs(*ost,floor);
756  else if (n.representation() == EST_Ngrammar::backoff)
757  {
758  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
759 
760  for(i=0;i<total_ngrams;i++)
761  {
763  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
764  this_pdf = n.prob_dist(this_ngram);
765 
766  for (k=this_pdf.item_start();
767  !this_pdf.item_end(k);
768  k = this_pdf.item_next(k))
769  {
770  double freq;
771  EST_String name;
772  this_pdf.item_freq(k,name,freq);
773 
774  for (int jj=0; jj < this_ngram.n(); jj++)
775  *ost << this_ngram(jj) << " ";
776  *ost << name << " : " << freq << endl;
777  }
778  }
779  }
780 
781  if(ost != &cout)
782  delete ost;
783 
784  return write_ok;
785 }
786 
787 EST_write_status
788 save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
789 {
790  // Save as a WFST
791  FILE *ost;
792  int i;
793 
794  if ((ost = fopen(filename,"wb")) == NULL)
795  {
796  cerr << "Ngrammar save: unable to open \"" << filename <<
797  "\" for writing" << endl;
798  return write_fail;
799  }
800 
801  fprintf(ost,"EST_File fst\n");
802  fprintf(ost,"DataType ascii\n");
803  fprintf(ost,"in \"(");
804  for (i=0; i < n.vocab->length(); i++)
805  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
806  fprintf(ost," )\"\n");
807  fprintf(ost,"out \"(");
808  for (i=0; i < n.vocab->length(); i++)
809  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
810  fprintf(ost," )\"\n");
811  fprintf(ost,"NumStates %d\n",n.num_states());
812  fprintf(ost,"EST_Header_End\n");
813 
814  for (i=0; i<n.num_states(); i++)
815  {
816  fprintf(ost,"((%d nonfinal %d)\n",i,i);
817  fprintf(ost,")\n");
818  }
819 
820  fclose(ost);
821 
822  return write_ok;
823 }
824 
825 EST_write_status
826 save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
827  const bool trace, double floor)
828 {
829 
830  if (n.representation() == EST_Ngrammar::sparse)
831  return misc_write_error;
832 
833  int i;
834  EST_Litem *k;
835  FILE *ofd;
836  double lfreq = -1;
837  double count = -1;
838  int magic = EST_NGRAMBIN_MAGIC;
839 
840  if (filename == "-")
841  {
842  if ((ofd=stdout) == NULL)
843  return misc_write_error;
844  }
845  else
846  {
847  if ((ofd=fopen(filename,"wb")) == NULL)
848  return misc_write_error;
849  }
850 
851  fwrite(&magic,sizeof(int),1,ofd);
852  fprintf(ofd,"mBin_2 %d\n",n.order());
853  for (i=0; i < n.vocab->length(); i++)
854  fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
855  fprintf(ofd,"\n");
856  for (i=0; i < n.pred_vocab->length(); i++)
857  fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
858  fprintf(ofd,"\n");
859 
860  // We use a simple form of run-length encoding, if consecutive
861  // values are equal only a length is printed. lengths are
862  // negative as frequencies (even smoothed ones) can never be -ve
863 
864  if ( trace )
865  cerr << "Saving ..." << endl;
866 
867  if (n.representation() == EST_Ngrammar::dense)
868  {
869  for(i=0;i<n.num_states();i++)
870  {
871 
872  if ( trace )
873  cerr << "\r" << i*100/n.num_states() << "%";
874 
875  for (k=n.p_states[i].pdf().item_start();
876  !n.p_states[i].pdf().item_end(k);
877  k = n.p_states[i].pdf().item_next(k))
878  {
879  double freq;
880  EST_String name;
881  n.p_states[i].pdf().item_freq(k,name,freq);
882  if (freq == 0.0)
883  freq = floor;
884  if (freq == lfreq)
885  count--;
886  else
887  {
888  if (count < -1)
889  fwrite(&count,sizeof(double),1,ofd);
890  fwrite(&freq,sizeof(double),1,ofd);
891  count = -1;
892  }
893  lfreq = freq;
894  }
895  }
896  if (count < -1)
897  fwrite(&count,sizeof(double),1,ofd);
898  }
899  else if (n.representation() == EST_Ngrammar::backoff)
900  {
901  // need to construct pdfs in right order
902  // noting that dense states are indexed s.t. the last
903  // word in the ngram is the least significant 'bit'
904 
905  // number of ngrams, excluding last word, is
906  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
907 
908  for(i=0;i<total_ngrams;i++)
909  {
910 
911  if ( trace )
912  cerr << "\r" << i*100/total_ngrams << "%";
913 
915  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
916  this_pdf = n.prob_dist(this_ngram);
917 
918  for (k=this_pdf.item_start();
919  !this_pdf.item_end(k);
920  k = this_pdf.item_next(k))
921  {
922 
923  double freq;
924  EST_String name;
925  this_pdf.item_freq(k,name,freq);
926  if (freq == lfreq)
927  count--;
928  else
929  {
930  if (count < -1)
931  fwrite(&count,sizeof(double),1,ofd);
932  fwrite(&freq,sizeof(double),1,ofd);
933  count = -1;
934  }
935  lfreq = freq;
936  }
937 
938 
939  }
940 
941  }
942  if ( trace )
943  cerr << "\r \r" << endl;
944 
945  fclose(ofd);
946 
947  return write_ok;
948 }
EST_TokenStream::eof
int eof()
end of file
Definition: EST_Token.h:356
EST_Token::filepos
int filepos(void) const
file position in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:186
EST_TList
Definition: EST_TList.h:109
EST_Discrete::name
const EST_String & name(const int n) const
The name given the index.
Definition: EST_simplestats.h:94
EST_DiscreteProbDistribution::item_start
EST_Litem * item_start() const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:370
EST_TokenStream
Definition: EST_Token.h:235
EST_TList::append
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:191
EST_String::contains
int contains(const char *s, int pos=-1) const
Does it contain this substring?
Definition: EST_String.h:375
EST_TokenStream::filepos
int filepos(void) const
current file position in \Ref{EST_TokenStream}
Definition: EST_Token.h:361
EST_DiscreteProbDistribution::cumulate
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
Definition: EST_DProbDist.cc:157
EST_Discrete::length
const int length(void) const
The number of members in the discrete.
Definition: EST_simplestats.h:84
EST_TokenStream::get_upto_eoln
EST_Token get_upto_eoln(void)
get up to {\tt s} in end of line as a single token.
Definition: EST_Token.cc:516
EST_TokenStream::set_SingleCharSymbols
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:338
EST_TokenStream::close
void close(void)
Close stream.
Definition: EST_Token.cc:406
EST_DiscreteProbDistribution::item_next
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:386
EST_DiscreteProbDistribution
Definition: EST_simplestats.h:210
EST_TokenStream::eoln
int eoln()
end of line
Definition: EST_Token.cc:818
EST_TokenStream::open
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
Definition: EST_Token.cc:200
EST_UItem
Definition: EST_UList.h:51
EST_TSimpleVector< int >
EST_TokenStream::get
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486
EST_Ngrammar
Definition: EST_Ngrammar.h:209
EST_String
Definition: EST_String.h:70
EST_DiscreteProbDistribution::set_frequency
void set_frequency(const EST_String &s, double c)
Definition: EST_DProbDist.cc:268
EST_TokenStream::set_WhiteSpaceChars
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:335
EST_String::before
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:286
EST_DiscreteProbDistribution::item_end
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:378
EST_String::after
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:318
EST_TVector::n
INLINE int n() const
number of items in vector.
Definition: EST_TVector.h:254
EST_DiscreteProbDistribution::item_freq
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
Definition: EST_DProbDist.cc:402
EST_TVector< EST_String >
EST_TokenStream::peek
EST_Token & peek(void)
peek at next token
Definition: EST_Token.cc:830