Skip to content

Commit 0523bfe

Browse files
committed
Add support for Tokenized BLEU
* change "BLEU score" to "SacreBLEU" * accept tokenization function to compute tokenized BLEU * return tokenized BLEU as "BLEU score" for direct comparison with paperswithcode results * update docs
1 parent 13b526c commit 0523bfe

File tree

3 files changed

+37
-10
lines changed

3 files changed

+37
-10
lines changed

docs/docs/wmt.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,24 @@ evaluator = WMTEvaluator(
7676

7777
The above will directly compare with the result of the paper when run on the server.
7878

79+
By default the evaluator computes a detokenized mixed-case SacreBLEU score.
80+
To get a tokenized BLEU score as well, during construction of the evaluator set
81+
a `tokenization: Callable[[str], str]` parameter to a function that tokenizes
82+
an input segment and returns the segment with tokens separated by spaces, e.g.:
83+
84+
``` python
85+
def get_tokenization():
86+
mt = sacremoses.MosesTokenizer()
87+
def tokenize(sentence):
88+
return mt.tokenize(sentence, return_str=True)
89+
return tokenize
90+
91+
evaluator = WMTEvaluator(
92+
...,
93+
tokenization=get_tokenization()
94+
)
95+
```
96+
7997
Instead of parsing the dataset files by yourself you can access raw segments as strings:
8098

8199
``` python

sotabencheval/machine_translation/metrics.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from dataclasses import dataclass
22
from bs4 import BeautifulSoup
33
from pathlib import Path
4-
from typing import Dict, List
4+
from typing import Dict, List, Callable
55
from collections import OrderedDict
66
from sacrebleu import corpus_bleu
77

@@ -10,12 +10,16 @@
1010

1111

1212
class TranslationMetrics:
13-
def __init__(self, source_dataset_path: Path, target_dataset_path):
13+
def __init__(self,
14+
source_dataset_path: Path,
15+
target_dataset_path: Path,
16+
tokenization: Callable[[str], str] = None):
1417
self._src_dataset_path = source_dataset_path
1518
self._dst_dataset_path = target_dataset_path
1619
self.answers = {}
1720
self.source_documents, self.source_segments = self._load_dataset(self._src_dataset_path)
1821
self._target_documents, self._target_segments = self._load_dataset(self._dst_dataset_path)
22+
self._tokenization = tokenization
1923
self._results = None
2024

2125
def _load_dataset(self, dataset_path):
@@ -41,12 +45,16 @@ def evaluate(self, ignore_missing=False):
4145
target_segments = {sid: text for sid, text in self._target_segments.items() if sid in keep}
4246
else:
4347
target_segments = self._target_segments
44-
references = [[target for target in target_segments.values()]]
4548
answers = [self.answers.get(sid, "") for sid in target_segments]
46-
bleu = corpus_bleu(answers, references)
47-
self._results = {
48-
'BLEU score': bleu.score
49-
}
49+
references = [target for target in target_segments.values()]
50+
bleu = corpus_bleu(answers, [references])
51+
self._results = {'SacreBLEU': bleu.score}
52+
53+
if self._tokenization is not None:
54+
tok_answers = [self._tokenization(answer) for answer in answers]
55+
tok_references = [self._tokenization(target) for target in references]
56+
tok_bleu = corpus_bleu(tok_answers, [tok_references], tokenize='none', force=True)
57+
self._results['BLEU score'] = tok_bleu.score
5058

5159
@property
5260
def has_data(self):

sotabencheval/machine_translation/wmt.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from sotabencheval.machine_translation.languages import Language
44
from sotabencheval.machine_translation.metrics import TranslationMetrics
55
from sotabencheval.utils import get_max_memory_allocated
6-
from typing import Dict
6+
from typing import Dict, Callable
77
from pathlib import Path
88
from enum import Enum
99
import time
@@ -33,7 +33,8 @@ def __init__(self,
3333
paper_arxiv_id: str = None,
3434
paper_pwc_id: str = None,
3535
paper_results: dict = None,
36-
model_description=None):
36+
model_description: str = None,
37+
tokenization: Callable[[str], str] = None):
3738
super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
3839
self.root = change_root_if_server(root=local_root,
3940
server_root=".data/nlp/wmt")
@@ -51,7 +52,7 @@ def __init__(self,
5152
self.source_dataset_path = Path(self.root) / source_dataset_filename
5253
self.target_dataset_path = Path(self.root) / target_dataset_filename
5354

54-
self.metrics = TranslationMetrics(self.source_dataset_path, self.target_dataset_path)
55+
self.metrics = TranslationMetrics(self.source_dataset_path, self.target_dataset_path, tokenization)
5556

5657
def _get_source_dataset_filename(self):
5758
if self.dataset == WMTDataset.News2014:

0 commit comments

Comments
 (0)