@@ -108,13 +108,9 @@ void ArpaLm::read() {
108108 std::string tempfile;
109109
110110 if (paramReverseLm (config)) {
111- char * tmp = tempnam (0 , " lm" );
112- tempfile = tmp;
113- ::free (tmp);
114- log () << " reversing ARPA language model into temporary file '" << tempfile << " '" ;
115- Lm::reverseArpaLm (filename, tempfile);
116- log () << " successfully reversed" ;
117- filename = tempfile;
111+ log () << " reversing ARPA language model" ;
112+ filename = Lm::reverseArpaLm (filename);
113+ log () << " successfully reversed into temporary file '" << filename << " '" ;
118114 }
119115
120116 Core::CompressedInputStream* cis = new Core::CompressedInputStream (filename.c_str ());
@@ -132,14 +128,17 @@ void ArpaLm::read() {
132128 Core::MD5 md5;
133129 std::string line;
134130 u32 lineNumber = 0 , totalNGrams = 0 , expectedTotalNGrams = 0 , maxTotalNGrams = 0 ;
135- enum { preamble,
136- sizes,
137- ngrams,
138- postamble,
139- unknown } state = preamble;
140- u32 nGram, n;
141- Token tokens[maxNGramLength];
142- Core::ProgressIndicator pi (" reading ARPA lm" , " n-grams" );
131+ enum {
132+ preamble,
133+ sizes,
134+ ngrams,
135+ postamble,
136+ unknown
137+ } state = preamble;
138+ u32 nGram, n;
139+ Token tokens[maxNGramLength];
140+ std::unordered_set<std::string> unknownSyntacticTokenMap;
141+ Core::ProgressIndicator pi (" reading ARPA lm" , " n-grams" );
143142 pi.start ();
144143 while (!std::getline (is, line).eof ()) {
145144 ++lineNumber;
@@ -205,7 +204,10 @@ void ArpaLm::read() {
205204 tokens[n] = t;
206205 }
207206 else {
208- warning (" unknown syntactic token '%s' in line %d" , word.c_str (), lineNumber);
207+ if (unknownSyntacticTokenMap.find (word) == unknownSyntacticTokenMap.end ()) {
208+ unknownSyntacticTokenMap.insert (word);
209+ warning (" unknown syntactic token '%s' in line %d" , word.c_str (), lineNumber);
210+ }
209211 break ;
210212 }
211213 }
@@ -224,6 +226,7 @@ void ArpaLm::read() {
224226 }
225227 }
226228 pi.finish ();
229+ log (" Unknown syntactic tokens in total: %lu" , unknownSyntacticTokenMap.size ());
227230 if (state != postamble)
228231 error (" Premature end of language model file." );
229232 dependency_.setValue (md5);
@@ -232,9 +235,6 @@ void ArpaLm::read() {
232235 */
233236 initialize (&*data->items .begin (), &(*(data->items .end () - 1 )) + 1 );
234237 delete data;
235-
236- if (tempfile.size ())
237- std::remove (tempfile.c_str ());
238238}
239239
240240ArpaClassLm::ArpaClassLm (const Core::Configuration& c, Bliss::LexiconRef l)
0 commit comments