@@ -13,6 +13,53 @@ class SQuADVersion(Enum):
class SQuADEvaluator(BaseEvaluator):
+    """Evaluator for the Stanford Question Answering Dataset (SQuAD) v1.1 and v2.0 benchmarks.
+
+    Examples:
+        Evaluate a BiDAF model from the AllenNLP repository on the SQuAD 1.1 development set:
+
+        .. code-block:: python
+
+            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
+
+            from allennlp.data import DatasetReader
+            from allennlp.data.iterators import DataIterator
+            from allennlp.models.archival import load_archive
+            from allennlp.nn.util import move_to_device
+
+            def load_model(url, batch_size=64):
+                archive = load_archive(url, cuda_device=0)
+                model = archive.model
+                reader = DatasetReader.from_params(archive.config["dataset_reader"])
+                iterator_params = archive.config["iterator"]
+                iterator_params["batch_size"] = batch_size
+                data_iterator = DataIterator.from_params(iterator_params)
+                data_iterator.index_with(model.vocab)
+                return model, reader, data_iterator
+
+            def evaluate(model, dataset, data_iterator, evaluator):
+                model.eval()
+                evaluator.reset_time()
+                for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
+                    batch = move_to_device(batch, 0)
+                    predictions = model(**batch)
+                    answers = {metadata['id']: prediction
+                               for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
+                    evaluator.add(answers)
+                    if evaluator.cache_exists:
+                        break
+
+            evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
+                                       paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
+
+            model, reader, data_iter = \
+                load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
+            dataset = reader.read(evaluator.dataset_path)
+            evaluate(model, dataset, data_iter, evaluator)
+            evaluator.save()
+            print(evaluator.results)
+    """
+
    task = "Question Answering"

    def __init__(self,
@@ -24,6 +71,38 @@ def __init__(self,
                 paper_results: dict = None,
                 model_description=None,
                 version: SQuADVersion = SQuADVersion.V20):
+        """
+        Creates an evaluator for the SQuAD v1.1 or v2.0 Question Answering benchmarks.
+
+        :param local_root: Path to the directory where the dataset files are located locally.
+            Ignored when run on the sotabench server.
+        :param dataset_filename: Local filename of the JSON file with the SQuAD dataset.
+            If None, the standard filename for the given ``version`` is used.
+            Ignored when run on the sotabench server.
+        :param model_name: The name of the model from the paper - use this if you want to link
+            your build to a model from a machine learning paper. See the SQuAD benchmark pages
+            (e.g., https://sotabench.com/benchmarks/question-answering-on-squad11-dev)
+            for model names, listed under the paper leaderboard and "models yet to try" tabs.
+        :param paper_arxiv_id: Optional linking to arXiv if you
+            want to link to papers on the leaderboard; put in the
+            corresponding paper's arXiv ID, e.g. '1907.10529'.
+        :param paper_pwc_id: Optional linking to Papers With Code;
+            put in the corresponding Papers With Code URL slug, e.g.
+            'spanbert-improving-pre-training-by'.
+        :param paper_results: If the paper model you are reproducing
+            does not have model results on sotabench.com, you can specify
+            the paper results yourself through this argument, where keys
+            are metric names and values are metric values, e.g.:
+
+            {'EM': 0.858, 'F1': 0.873}
+
+            Ensure that the metric names match those on the sotabench
+            leaderboard - for the SQuAD benchmarks they are `EM` for exact match
+            and `F1` for F1 score. Make sure to use results of evaluation on the development set.
+        :param model_description: Optional model description.
+        :param version: Which dataset to evaluate on, either `SQuADVersion.V11` or `SQuADVersion.V20`.
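+
+        Examples:
+            A minimal sketch of linking a build to a paper. The arXiv ID, Papers With Code slug
+            and scores are the illustrative values from the parameter descriptions above; the
+            model name is a placeholder, so check the benchmark page for the exact name to use:
+
+            .. code-block:: python
+
+                from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
+
+                evaluator = SQuADEvaluator(
+                    local_root="data/nlp/squad",
+                    model_name="SpanBERT (single model)",
+                    paper_arxiv_id="1907.10529",
+                    paper_pwc_id="spanbert-improving-pre-training-by",
+                    paper_results={"EM": 0.858, "F1": 0.873},
+                    version=SQuADVersion.V20,
+                )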
+        """
        super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
        self.root = change_root_if_server(root=local_root,
                                          server_root=".data/nlp/squad")
@@ -35,6 +114,23 @@ def __init__(self,
        self.metrics = SQuADMetrics(self.dataset_path, version)

    def add(self, answers: Dict[str, str]):
+        """
+        Updates the evaluator with new results.
+
+        :param answers: a dictionary where keys are question IDs and values are text answers.
+            For unanswerable questions (SQuAD v2.0) the answer should be an empty string.
+
+        Examples:
+            Update the evaluator with two results:
+
+            .. code-block:: python
+
+                my_evaluator.add({
+                    "57296d571d04691400779413": "itself",
+                    "5a89117e19b91f001a626f2d": ""
+                })
+        """
+
        self.metrics.add(answers)

        if not self.first_batch_processed and self.metrics.has_data:
@@ -45,10 +141,30 @@ def add(self, answers: Dict[str, str]):
            self.first_batch_processed = True

    def reset(self):
+        """
+        Removes all answers that have already been added.
+
+        To check whether the model should be rerun on the whole dataset, it is first run on a
+        smaller subset and the results are compared with values cached on the sotabench server
+        (the check is not performed when running locally). Ideally, the smaller subset is just
+        the first batch, so no additional computation is needed. However, for more complex
+        multi-stage pipelines it may be simpler to run the model twice - on a small subset and
+        (if necessary) on the full dataset. In that case :func:`reset` needs to be called before
+        the second run so that values from the first run are not reported, as in the sketch below.
+
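+        Examples:
+            A rough sketch of the two-pass pattern described above (``run_pipeline``,
+            ``small_subset`` and ``full_dataset`` are hypothetical stand-ins for your own code;
+            each call is expected to feed answers to the evaluator via :func:`add`):
+
+            .. code-block:: python
+
+                # First pass: a small subset, enough to compare against the server-side cache.
+                run_pipeline(small_subset, my_evaluator)
+                if not my_evaluator.cache_exists:
+                    # No cached results matched: discard the partial run and evaluate everything.
+                    my_evaluator.reset()
+                    run_pipeline(full_dataset, my_evaluator)
+                my_evaluator.save()
+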
+        .. seealso:: :func:`cache_exists`
+        .. seealso:: :func:`reset_time`
+        """
+
        self.metrics.reset()
        self.reset_time()

    def get_results(self):
+        """
+        Gets the results for the evaluator.
+
+        :return: dict with the `EM` (exact match) and `F1` scores.
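+
+        Examples:
+            A quick sketch of reading the scores (``my_evaluator`` follows the naming used in
+            the other examples in this class):
+
+            .. code-block:: python
+
+                results = my_evaluator.get_results()
+                print(results["EM"], results["F1"])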
+        """
+
        if self.cached_results:
            return self.results
        self.results = self.metrics.get_results()