Commit f0c58ed

Backport AppTek changes to LM + LM lookahead code (#74)
1 parent f4ed97e commit f0c58ed

21 files changed: +1031 / -618 lines

src/Bliss/SyntacticTokenMap.hh

Lines changed: 10 additions & 0 deletions
@@ -62,6 +62,16 @@ public:
         verify_(s->id() < store_.size());
         return store_[s->id()];
     }
+
+    Value& operator[](const Token::Id id) {
+        verify_(id < store_.size());
+        return store_[id];
+    }
+
+    const Value& operator[](const Token::Id id) const {
+        verify_(id < store_.size());
+        return store_[id];
+    }
 };

 }  // namespace Bliss
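
For context on the two added operator[] overloads: below is a minimal, self-contained sketch of the access pattern they enable, indexing the same underlying store either through a token pointer (the pre-existing path) or directly through the numeric token id (the new path). TokenMap, Token and Token::Id are simplified stand-ins for the Bliss types, and assert() stands in for verify_(); this is not the actual Bliss API.

// Simplified stand-in for Bliss::SyntacticTokenMap: a vector indexed by token id.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct Token {
    using Id = uint32_t;
    Id id_;
    Id id() const { return id_; }
};

template<typename Value>
class TokenMap {
public:
    explicit TokenMap(std::size_t nTokens) : store_(nTokens) {}

    // pre-existing access path: via a token pointer
    Value& operator[](const Token* t) {
        assert(t->id() < store_.size());
        return store_[t->id()];
    }

    // new access path: directly via the token id, no pointer needed
    Value& operator[](Token::Id id) {
        assert(id < store_.size());
        return store_[id];
    }

private:
    std::vector<Value> store_;
};

int main() {
    TokenMap<std::string> map(4);
    Token t{2};
    map[&t]           = "via pointer";
    map[Token::Id(3)] = "via id";
    return 0;
}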

src/Lm/AbstractNNLanguageModel.cc

Lines changed: 9 additions & 2 deletions
@@ -27,7 +27,14 @@ Core::ParameterString AbstractNNLanguageModel::paramVocabUnknownWord(
         "vocab-unknown-word", "the word from the provided vocabulary file that will serve as unknown token", "");

 AbstractNNLanguageModel::AbstractNNLanguageModel(Core::Configuration const& c, Bliss::LexiconRef l)
-        : Core::Component(c), Precursor(c, l), collect_statistics_(paramCollectStatistics(c)), vocab_file_(paramVocabularyFile(c)), unknown_word_(paramVocabUnknownWord(config)), lexicon_(l), num_outputs_(0ul), lexicon_mapping_(), usage_histogram_() {
+        : Core::Component(c),
+          Precursor(c, l),
+          collect_statistics_(paramCollectStatistics(c)),
+          vocab_file_(paramVocabularyFile(c)),
+          unknown_word_(paramVocabUnknownWord(config)),
+          lexicon_(l),
+          num_outputs_(0ul),
+          usage_histogram_() {
     NNHistoryManager* hm = new NNHistoryManager();
     if (collect_statistics_) {
         hm->setOnReleaseHandler(std::bind(&AbstractNNLanguageModel::onRelease, this, std::placeholders::_1));

@@ -89,7 +96,7 @@ void AbstractNNLanguageModel::onRelease(HistoryHandle handle) {
     NNCacheWithStats const* c = reinterpret_cast<NNCacheWithStats const*>(handle);
     if (not c->output_used.empty()) {
         unsigned used_outputs = std::accumulate(c->output_used.begin(), c->output_used.end(),
-                                                0u, [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
+                                                0u, [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
         size_t promille_used = static_cast<size_t>((1000.0 * used_outputs) / c->output_used.size());
         if (usage_histogram_.size() <= promille_used) {
             usage_histogram_.resize(promille_used + 1ul);
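
Aside from the initializer-list reformatting, the onRelease() context above also shows the output-usage statistic this class collects. Below is a self-contained sketch of that bookkeeping, with plain standard-library types standing in for the Sprint/RASR members (output_used, usage_histogram_), assuming the intent is to bucket per-cache output usage in per-mille steps.

#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // per-output "was this score ever queried" flags of one released cache entry
    std::vector<bool> output_used = {true, false, true, true, false, false, true, false};
    std::vector<unsigned> usage_histogram;

    // count the used outputs ...
    unsigned used_outputs = std::accumulate(output_used.begin(), output_used.end(), 0u,
                                            [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
    // ... convert to per-mille and bump the corresponding histogram bucket
    std::size_t promille_used = static_cast<std::size_t>((1000.0 * used_outputs) / output_used.size());
    if (usage_histogram.size() <= promille_used) {
        usage_histogram.resize(promille_used + 1ul);
    }
    usage_histogram[promille_used] += 1u;

    std::printf("%u of %zu outputs used (%zu per mille)\n",
                used_outputs, output_used.size(), promille_used);
    return 0;
}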

src/Lm/AbstractNNLanguageModel.hh

Lines changed: 3 additions & 4 deletions
@@ -40,10 +40,9 @@ protected:
     std::string vocab_file_;
     std::string unknown_word_;

-    Bliss::LexiconRef lexicon_;
-    size_t num_outputs_;
-    std::vector<size_t> lexicon_mapping_;
-    std::vector<u32> usage_histogram_;
+    Bliss::LexiconRef lexicon_;
+    size_t num_outputs_;
+    std::vector<u32> usage_histogram_;

     void loadVocabulary();
     void useOutput(NNCacheWithStats const& cache, size_t idx) const;

src/Lm/ArpaLm.cc

Lines changed: 19 additions & 19 deletions
@@ -108,13 +108,9 @@ void ArpaLm::read() {
     std::string tempfile;

     if (paramReverseLm(config)) {
-        char* tmp = tempnam(0, "lm");
-        tempfile = tmp;
-        ::free(tmp);
-        log() << "reversing ARPA language model into temporary file '" << tempfile << "'";
-        Lm::reverseArpaLm(filename, tempfile);
-        log() << "successfully reversed";
-        filename = tempfile;
+        log() << "reversing ARPA language model";
+        filename = Lm::reverseArpaLm(filename);
+        log() << "successfully reversed into temporary file '" << filename << "'";
     }

     Core::CompressedInputStream* cis = new Core::CompressedInputStream(filename.c_str());

@@ -132,14 +128,17 @@ void ArpaLm::read() {
     Core::MD5 md5;
     std::string line;
     u32 lineNumber = 0, totalNGrams = 0, expectedTotalNGrams = 0, maxTotalNGrams = 0;
-    enum { preamble,
-           sizes,
-           ngrams,
-           postamble,
-           unknown } state = preamble;
-    u32 nGram, n;
-    Token tokens[maxNGramLength];
-    Core::ProgressIndicator pi("reading ARPA lm", "n-grams");
+    enum {
+        preamble,
+        sizes,
+        ngrams,
+        postamble,
+        unknown
+    } state = preamble;
+    u32 nGram, n;
+    Token tokens[maxNGramLength];
+    std::unordered_set<std::string> unknownSyntacticTokenMap;
+    Core::ProgressIndicator pi("reading ARPA lm", "n-grams");
     pi.start();
     while (!std::getline(is, line).eof()) {
         ++lineNumber;

@@ -205,7 +204,10 @@ void ArpaLm::read() {
                     tokens[n] = t;
                 }
                 else {
-                    warning("unknown syntactic token '%s' in line %d", word.c_str(), lineNumber);
+                    if (unknownSyntacticTokenMap.find(word) == unknownSyntacticTokenMap.end()) {
+                        unknownSyntacticTokenMap.insert(word);
+                        warning("unknown syntactic token '%s' in line %d", word.c_str(), lineNumber);
+                    }
                     break;
                 }
             }

@@ -224,6 +226,7 @@ void ArpaLm::read() {
         }
     }
     pi.finish();
+    log("Unknown syntactic tokens in total: %lu", unknownSyntacticTokenMap.size());
     if (state != postamble)
         error("Premature end of language model file.");
     dependency_.setValue(md5);

@@ -232,9 +235,6 @@ void ArpaLm::read() {
     */
     initialize(&*data->items.begin(), &(*(data->items.end() - 1)) + 1);
     delete data;
-
-    if (tempfile.size())
-        std::remove(tempfile.c_str());
 }

 ArpaClassLm::ArpaClassLm(const Core::Configuration& c, Bliss::LexiconRef l)
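
The net effect of the ArpaLm changes is that temporary-file handling moves into Lm::reverseArpaLm(), and that each unknown syntactic token is warned about only once, with a summary count logged after reading. Below is a minimal, self-contained sketch of the warn-once pattern, using printf in place of the Core::Component warning()/log() channels and a plain word list in place of the ARPA parsing loop.

#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
    // stand-in for the stream of syntactic tokens seen while parsing the n-gram sections
    std::vector<std::string> words = {"foo", "bar", "foo", "baz", "bar"};
    std::unordered_set<std::string> unknownSeen;

    for (std::size_t lineNumber = 1; lineNumber <= words.size(); ++lineNumber) {
        const std::string& word = words[lineNumber - 1];
        // warn only on the first occurrence of a given unknown token
        if (unknownSeen.find(word) == unknownSeen.end()) {
            unknownSeen.insert(word);
            std::printf("warning: unknown syntactic token '%s' in line %zu\n", word.c_str(), lineNumber);
        }
    }
    // summary, analogous to the log() call added after pi.finish()
    std::printf("Unknown syntactic tokens in total: %zu\n", unknownSeen.size());
    return 0;
}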
