Commit f0c58ed

Backport AppTek changes to LM + LM lookahead code (#74)
1 parent f4ed97e commit f0c58ed

21 files changed: +1031 / -618 lines

src/Bliss/SyntacticTokenMap.hh

Lines changed: 10 additions & 0 deletions
@@ -62,6 +62,16 @@ public:
         verify_(s->id() < store_.size());
         return store_[s->id()];
     }
+
+    Value& operator[](const Token::Id id) {
+        verify_(id < store_.size());
+        return store_[id];
+    }
+
+    const Value& operator[](const Token::Id id) const {
+        verify_(id < store_.size());
+        return store_[id];
+    }
 };

 }  // namespace Bliss
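
For context on the two added operator[] overloads: below is a minimal, self-contained sketch of the access pattern they enable, indexing the same underlying store either through a token pointer (the pre-existing path) or directly through the numeric token id (the new path). TokenMap, Token and Token::Id are simplified stand-ins for the Bliss types, and assert() stands in for verify_(); this is not the actual Bliss API.

// Simplified stand-in for Bliss::SyntacticTokenMap: a vector indexed by token id.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct Token {
    using Id = uint32_t;
    Id id_;
    Id id() const { return id_; }
};

template<typename Value>
class TokenMap {
public:
    explicit TokenMap(std::size_t nTokens) : store_(nTokens) {}

    // pre-existing access path: via a token pointer
    Value& operator[](const Token* t) {
        assert(t->id() < store_.size());
        return store_[t->id()];
    }

    // new access path: directly via the token id, no pointer needed
    Value& operator[](Token::Id id) {
        assert(id < store_.size());
        return store_[id];
    }

private:
    std::vector<Value> store_;
};

int main() {
    TokenMap<std::string> map(4);
    Token t{2};
    map[&t]           = "via pointer";
    map[Token::Id(3)] = "via id";
    return 0;
}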

src/Lm/AbstractNNLanguageModel.cc

Lines changed: 9 additions & 2 deletions
@@ -27,7 +27,14 @@ Core::ParameterString AbstractNNLanguageModel::paramVocabUnknownWord(
         "vocab-unknown-word", "the word from the provided vocabulary file that will serve as unknown token", "");

 AbstractNNLanguageModel::AbstractNNLanguageModel(Core::Configuration const& c, Bliss::LexiconRef l)
-        : Core::Component(c), Precursor(c, l), collect_statistics_(paramCollectStatistics(c)), vocab_file_(paramVocabularyFile(c)), unknown_word_(paramVocabUnknownWord(config)), lexicon_(l), num_outputs_(0ul), lexicon_mapping_(), usage_histogram_() {
+        : Core::Component(c),
+          Precursor(c, l),
+          collect_statistics_(paramCollectStatistics(c)),
+          vocab_file_(paramVocabularyFile(c)),
+          unknown_word_(paramVocabUnknownWord(config)),
+          lexicon_(l),
+          num_outputs_(0ul),
+          usage_histogram_() {
     NNHistoryManager* hm = new NNHistoryManager();
     if (collect_statistics_) {
         hm->setOnReleaseHandler(std::bind(&AbstractNNLanguageModel::onRelease, this, std::placeholders::_1));

@@ -89,7 +96,7 @@ void AbstractNNLanguageModel::onRelease(HistoryHandle handle) {
     NNCacheWithStats const* c = reinterpret_cast<NNCacheWithStats const*>(handle);
     if (not c->output_used.empty()) {
         unsigned used_outputs = std::accumulate(c->output_used.begin(), c->output_used.end(),
-                                                0u, [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
+                                                0u, [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
         size_t promille_used = static_cast<size_t>((1000.0 * used_outputs) / c->output_used.size());
         if (usage_histogram_.size() <= promille_used) {
             usage_histogram_.resize(promille_used + 1ul);
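
Aside from the initializer-list reformatting, the onRelease() context above also shows the output-usage statistic this class collects. Below is a self-contained sketch of that bookkeeping, with plain standard-library types standing in for the Sprint/RASR members (output_used, usage_histogram_), assuming the intent is to bucket per-cache output usage in per-mille steps.

#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // per-output "was this score ever queried" flags of one released cache entry
    std::vector<bool> output_used = {true, false, true, true, false, false, true, false};
    std::vector<unsigned> usage_histogram;

    // count the used outputs ...
    unsigned used_outputs = std::accumulate(output_used.begin(), output_used.end(), 0u,
                                            [](unsigned sum, bool used) { return sum + (used ? 1u : 0u); });
    // ... convert to per-mille and bump the corresponding histogram bucket
    std::size_t promille_used = static_cast<std::size_t>((1000.0 * used_outputs) / output_used.size());
    if (usage_histogram.size() <= promille_used) {
        usage_histogram.resize(promille_used + 1ul);
    }
    usage_histogram[promille_used] += 1u;

    std::printf("%u of %zu outputs used (%zu per mille)\n",
                used_outputs, output_used.size(), promille_used);
    return 0;
}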

src/Lm/AbstractNNLanguageModel.hh

Lines changed: 3 additions & 4 deletions
@@ -40,10 +40,9 @@ protected:
     std::string vocab_file_;
     std::string unknown_word_;

-    Bliss::LexiconRef lexicon_;
-    size_t num_outputs_;
-    std::vector<size_t> lexicon_mapping_;
-    std::vector<u32> usage_histogram_;
+    Bliss::LexiconRef lexicon_;
+    size_t num_outputs_;
+    std::vector<u32> usage_histogram_;

     void loadVocabulary();
     void useOutput(NNCacheWithStats const& cache, size_t idx) const;

src/Lm/ArpaLm.cc

Lines changed: 19 additions & 19 deletions
@@ -108,13 +108,9 @@ void ArpaLm::read() {
     std::string tempfile;

     if (paramReverseLm(config)) {
-        char* tmp = tempnam(0, "lm");
-        tempfile = tmp;
-        ::free(tmp);
-        log() << "reversing ARPA language model into temporary file '" << tempfile << "'";
-        Lm::reverseArpaLm(filename, tempfile);
-        log() << "successfully reversed";
-        filename = tempfile;
+        log() << "reversing ARPA language model";
+        filename = Lm::reverseArpaLm(filename);
+        log() << "successfully reversed into temporary file '" << filename << "'";
     }

     Core::CompressedInputStream* cis = new Core::CompressedInputStream(filename.c_str());

@@ -132,14 +128,17 @@ void ArpaLm::read() {
     Core::MD5 md5;
     std::string line;
     u32 lineNumber = 0, totalNGrams = 0, expectedTotalNGrams = 0, maxTotalNGrams = 0;
-    enum { preamble,
-           sizes,
-           ngrams,
-           postamble,
-           unknown } state = preamble;
-    u32 nGram, n;
-    Token tokens[maxNGramLength];
-    Core::ProgressIndicator pi("reading ARPA lm", "n-grams");
+    enum {
+        preamble,
+        sizes,
+        ngrams,
+        postamble,
+        unknown
+    } state = preamble;
+    u32 nGram, n;
+    Token tokens[maxNGramLength];
+    std::unordered_set<std::string> unknownSyntacticTokenMap;
+    Core::ProgressIndicator pi("reading ARPA lm", "n-grams");
     pi.start();
     while (!std::getline(is, line).eof()) {
         ++lineNumber;

@@ -205,7 +204,10 @@ void ArpaLm::read() {
                     tokens[n] = t;
                 }
                 else {
-                    warning("unknown syntactic token '%s' in line %d", word.c_str(), lineNumber);
+                    if (unknownSyntacticTokenMap.find(word) == unknownSyntacticTokenMap.end()) {
+                        unknownSyntacticTokenMap.insert(word);
+                        warning("unknown syntactic token '%s' in line %d", word.c_str(), lineNumber);
+                    }
                     break;
                 }
             }

@@ -224,6 +226,7 @@ void ArpaLm::read() {
         }
     }
     pi.finish();
+    log("Unknown syntactic tokens in total: %lu", unknownSyntacticTokenMap.size());
     if (state != postamble)
         error("Premature end of language model file.");
     dependency_.setValue(md5);

@@ -232,9 +235,6 @@ void ArpaLm::read() {
     */
     initialize(&*data->items.begin(), &(*(data->items.end() - 1)) + 1);
     delete data;
-
-    if (tempfile.size())
-        std::remove(tempfile.c_str());
 }

 ArpaClassLm::ArpaClassLm(const Core::Configuration& c, Bliss::LexiconRef l)
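
The net effect of the ArpaLm changes is that temporary-file handling moves into Lm::reverseArpaLm(), and that each unknown syntactic token is warned about only once, with a summary count logged after reading. Below is a minimal, self-contained sketch of the warn-once pattern, using printf in place of the Core::Component warning()/log() channels and a plain word list in place of the ARPA parsing loop.

#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
    // stand-in for the stream of syntactic tokens seen while parsing the n-gram sections
    std::vector<std::string> words = {"foo", "bar", "foo", "baz", "bar"};
    std::unordered_set<std::string> unknownSeen;

    for (std::size_t lineNumber = 1; lineNumber <= words.size(); ++lineNumber) {
        const std::string& word = words[lineNumber - 1];
        // warn only on the first occurrence of a given unknown token
        if (unknownSeen.find(word) == unknownSeen.end()) {
            unknownSeen.insert(word);
            std::printf("warning: unknown syntactic token '%s' in line %zu\n", word.c_str(), lineNumber);
        }
    }
    // summary, analogous to the log() call added after pi.finish()
    std::printf("Unknown syntactic tokens in total: %zu\n", unknownSeen.size());
    return 0;
}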
