@@ -13,6 +13,53 @@ class SQuADVersion(Enum):
class SQuADEvaluator(BaseEvaluator):
+    """Evaluator for the Stanford Question Answering Dataset (SQuAD) v1.1 and v2.0 benchmarks.
+
+    Examples:
+        Evaluate a BiDAF model from the AllenNLP repository on the SQuAD 1.1 development set:
+
+        .. code-block:: python
+
+            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
+
+            from allennlp.data import DatasetReader
+            from allennlp.data.iterators import DataIterator
+            from allennlp.models.archival import load_archive
+            from allennlp.nn.util import move_to_device
+
+            def load_model(url, batch_size=64):
+                archive = load_archive(url, cuda_device=0)
+                model = archive.model
+                reader = DatasetReader.from_params(archive.config["dataset_reader"])
+                iterator_params = archive.config["iterator"]
+                iterator_params["batch_size"] = batch_size
+                data_iterator = DataIterator.from_params(iterator_params)
+                data_iterator.index_with(model.vocab)
+                return model, reader, data_iterator
+
+            def evaluate(model, dataset, data_iterator, evaluator):
+                model.eval()
+                evaluator.reset_time()
+                for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
+                    batch = move_to_device(batch, 0)
+                    predictions = model(**batch)
+                    answers = {metadata['id']: prediction
+                               for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
+                    evaluator.add(answers)
+                    if evaluator.cache_exists:
+                        break
+
+            evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
+                                       paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
+
+            model, reader, data_iter = \
+                load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
+            dataset = reader.read(evaluator.dataset_path)
+            evaluate(model, dataset, data_iter, evaluator)
+            evaluator.save()
+            print(evaluator.results)
+    """
+
    task = "Question Answering"

    def __init__(self,
@@ -24,6 +71,38 @@ def __init__(self,
                 paper_results: dict = None,
                 model_description=None,
                 version: SQuADVersion = SQuADVersion.V20):
+        """
+        Creates an evaluator for the SQuAD v1.1 or v2.0 Question Answering benchmarks.
+
+        :param local_root: Path to the directory where the dataset files are located locally.
+            Ignored when run on the sotabench server.
+        :param dataset_filename: Local filename of the JSON file with the SQuAD dataset.
+            If None, the standard filename for the given ``version`` is used.
+            Ignored when run on the sotabench server.
+        :param model_name: The name of the model from the paper - use this if you want to link
+            your build to a model from a machine learning paper. See the SQuAD benchmark pages
+            (e.g., https://sotabench.com/benchmarks/question-answering-on-squad11-dev)
+            for model names, listed under the paper leaderboard and "models yet to try" tabs.
+        :param paper_arxiv_id: Optional linking to arXiv if you
+            want to link to papers on the leaderboard; put in the
+            corresponding paper's arXiv ID, e.g. '1907.10529'.
+        :param paper_pwc_id: Optional linking to Papers With Code;
+            put in the corresponding Papers With Code URL slug, e.g.
+            'spanbert-improving-pre-training-by'.
+        :param paper_results: If the paper model you are reproducing
+            does not have model results on sotabench.com, you can specify
+            the paper results yourself through this argument, where keys
+            are metric names and values are metric values, e.g.:
+
+            {'EM': 0.858, 'F1': 0.873}
+
+            Ensure that the metric names match those on the sotabench
+            leaderboard - for the SQuAD benchmarks they are `EM` for exact match
+            and `F1` for F1 score. Make sure to use results of evaluation on the development set.
+        :param model_description: Optional model description.
+        :param version: Which dataset to evaluate on, either `SQuADVersion.V11` or `SQuADVersion.V20`.
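+
+        Examples:
+            A minimal sketch of linking a build to a paper. The arXiv ID, Papers With Code slug
+            and scores are the illustrative values from the parameter descriptions above; the
+            model name is a placeholder, so check the benchmark page for the exact name to use:
+
+            .. code-block:: python
+
+                from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
+
+                evaluator = SQuADEvaluator(
+                    local_root="data/nlp/squad",
+                    model_name="SpanBERT (single model)",
+                    paper_arxiv_id="1907.10529",
+                    paper_pwc_id="spanbert-improving-pre-training-by",
+                    paper_results={"EM": 0.858, "F1": 0.873},
+                    version=SQuADVersion.V20,
+                )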
+        """
        super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
        self.root = change_root_if_server(root=local_root,
                                          server_root=".data/nlp/squad")
@@ -35,6 +114,23 @@ def __init__(self,
        self.metrics = SQuADMetrics(self.dataset_path, version)

    def add(self, answers: Dict[str, str]):
+        """
+        Updates the evaluator with new results.
+
+        :param answers: a dictionary where keys are question IDs and values are text answers.
+            For unanswerable questions (SQuAD v2.0) the answer should be an empty string.
+
+        Examples:
+            Update the evaluator with two results:
+
+            .. code-block:: python
+
+                my_evaluator.add({
+                    "57296d571d04691400779413": "itself",
+                    "5a89117e19b91f001a626f2d": ""
+                })
+        """
+
        self.metrics.add(answers)

        if not self.first_batch_processed and self.metrics.has_data:
@@ -45,10 +141,30 @@ def add(self, answers: Dict[str, str]):
            self.first_batch_processed = True

    def reset(self):
+        """
+        Removes all answers that have already been added.
+
+        To check whether the model should be rerun on the whole dataset, it is first run on a
+        smaller subset and the results are compared with values cached on the sotabench server
+        (the check is not performed when running locally). Ideally, the smaller subset is just
+        the first batch, so no additional computation is needed. However, for more complex
+        multi-stage pipelines it may be simpler to run the model twice - on a small subset and
+        (if necessary) on the full dataset. In that case :func:`reset` needs to be called before
+        the second run so that values from the first run are not reported, as in the sketch below.
+
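+        Examples:
+            A rough sketch of the two-pass pattern described above (``run_pipeline``,
+            ``small_subset`` and ``full_dataset`` are hypothetical stand-ins for your own code;
+            each call is expected to feed answers to the evaluator via :func:`add`):
+
+            .. code-block:: python
+
+                # First pass: a small subset, enough to compare against the server-side cache.
+                run_pipeline(small_subset, my_evaluator)
+                if not my_evaluator.cache_exists:
+                    # No cached results matched: discard the partial run and evaluate everything.
+                    my_evaluator.reset()
+                    run_pipeline(full_dataset, my_evaluator)
+                my_evaluator.save()
+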
+        .. seealso:: :func:`cache_exists`
+        .. seealso:: :func:`reset_time`
+        """
+
        self.metrics.reset()
        self.reset_time()

    def get_results(self):
+        """
+        Gets the results for the evaluator.
+
+        :return: dict with the `EM` (exact match) and `F1` scores.
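+
+        Examples:
+            A quick sketch of reading the scores (``my_evaluator`` follows the naming used in
+            the other examples in this class):
+
+            .. code-block:: python
+
+                results = my_evaluator.get_results()
+                print(results["EM"], results["F1"])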
+        """
+
        if self.cached_results:
            return self.results
        self.results = self.metrics.get_results()