class BaseEvaluator:
10+ """Base class for evaluator objects on tasks
11+
12+ Currently SQuAD and WMT use this as a parent.
13+
14+ TODO: Refactor ImageNet, COCO, ADE20K, PASCAL to utilise this class
15+
16+ The core API design relies upon:
17+
18+ (a) Initializing an Evaluator object and linking to a paper, for example:
19+
20+ .. code-block:: python
21+
22+ from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
23+
24+ evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
25+ version=SQuADVersion.V20)
26+
27+ The paper metadata allows the results to be linked to paper results when submitted to sotabench.com.
28+
29+ (b) Adding Predictions (usually in batch) - example below for PyTorch iterating over DataLoader:
30+
31+ .. code-block:: python
32+
33+ for i, (input, target) in enumerate(data_loader):
34+ ...
35+ output = model(input)
36+ # potentially formatting of the output here
37+ evaluator.add(output)
38+
39+ These results are accumulated and then evaluated - i.e. metrics are calculated once done.
40+
41+ (c) Saving Results
42+
43+ .. code-block:: python
44+ evaluator.save()
45+
46+ Gets the evaluation results for the current predictions added to the Evaluation object - calculates metrics -
47+ then run if on the server, serializes results to a sotabench_results.json file which is processed and results
48+ are stored on the server.
49+
50+ These three steps: initialization -> adding predictions -> saving and evaluating results are the core API.
51+ They should be capable of integration with any existing evaluation logic in your repository.
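
    For fast re-runs, these steps can be combined with the cache_exists property (documented below) so that
    evaluation is skipped when the same model has already been evaluated. A minimal sketch, assuming a
    PyTorch-style model and data_loader as in the examples above:

    .. code-block:: python

        evaluator.reset_time()

        for i, (input, target) in enumerate(data_loader):
            output = model(input)
            # potentially formatting of the output here
            evaluator.add(output)
            if evaluator.cache_exists:
                break

        evaluator.save()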
    """

    def __init__(self,
                 model_name: str = None,
                 paper_arxiv_id: str = None,
                 paper_pwc_id: str = None,
                 paper_results: dict = None,
                 model_description=None):
        """
        Initializes a BaseEvaluator-like object.

        :param model_name: (str) The name of the model, for example 'ResNet-101', which will be saved to
            sotabench.com
        :param paper_arxiv_id: (str, optional) The arXiv id of the paper that the model is linked to,
            e.g. '1906.06423'
        :param paper_pwc_id: (str, optional) The PWC paper id (slug), e.g. 'albert-a-lite-bert-for-self-supervised'
        :param paper_results: (dict, optional) If the paper you are linking to does not have results on
            sotabench, you can add the paper's results here as a dictionary with metric names as keys and
            metric values as values. The metric names are benchmark specific; see the sketch below.
        :param model_description: (str, optional) Optional description for the model; this can contain details
            about where the weights are from, details about training, and more. This will appear in an info box
            for the model when it is displayed on sotabench.com.
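
        For example, a paper_results dictionary might look like the following (the metric names here are
        purely illustrative - use the metric names defined by the benchmark you are evaluating on):

        .. code-block:: python

            paper_results = {'EM': 85.7, 'F1': 88.7}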
        """

        # Model metadata

        self.model_name = model_name
        self.paper_arxiv_id = paper_arxiv_id
        self.paper_pwc_id = paper_pwc_id
        self.paper_results = paper_results
        self.model_description = model_description

        # Backend variables for hashing and caching

        self.first_batch_processed = False
        self.batch_hash = None
        self.cached_results = False
        self.results = None
        self._cache_exists = None

        # Speed and memory metrics

        self.init_time = time.time()
        self.speed_mem_metrics = {}

    # ... lines omitted; the following is from the cache_exists property ...

        then sets self.results to cached results and returns True.

        You can use this property for control flow to break a for loop over a dataset
        after the first iteration. This prevents re-running the same calculation for the
        same model twice.

        Q: Why should the user use this?
        A: If you want fast "continuous evaluation" and want to avoid rerunning the same model over and over
        each time you commit something new to your repository.

        Examples:
            Breaking a for loop if the model is the same as the one we evaluated last time

            .. code-block:: python


    # ... lines omitted ...

        return self._cache_exists

    def cache_values(self, **kwargs):
        """
        Takes in keyword arguments and converts each to a hashable (cacheable) format.

        :param kwargs: keyword arguments to be cached
        :return: a cacheable version of the keyword arguments
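
        A minimal sketch of how a subclass might use this when hashing its first batch of predictions (the
        ``first_batch_outputs`` variable and the hashing step are illustrative assumptions, not part of this
        class):

        .. code-block:: python

            if not self.first_batch_processed:
                hashable = self.cache_values(outputs=first_batch_outputs, model_name=self.model_name)
                self.batch_hash = str(hash(str(hashable)))  # illustrative hashing only
                self.first_batch_processed = True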
        """
        return cache_value(kwargs)

    def reset_time(self):
        """
        Simple method to reset the timer self.init_time. Often used before an evaluation loop, so that the
        evaluation is timed appropriately, for example:

        .. code-block:: python

            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

            evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
                                       version=SQuADVersion.V20)

            # processing/setup logic here

            evaluator.reset_time()

            for i, (input, target) in enumerate(data_loader):
                ...
                output = model(input)
                # potentially formatting of the output here
                evaluator.add(output)

            evaluator.save()

        Above we may have processing logic in between the evaluator initialization and the actual evaluation
        loop, so we reset the timer to get a fair timing of the evaluation itself (rather than of setup steps
        like data processing, loading the model, etc.).

        :return: void - resets self.init_time
        """
        self.init_time = time.time()

    def save(self, **kwargs):
        """
        Calculate results and then put them into a BenchmarkResult object.

        On the sotabench.com server, this will produce a JSON serialisation in sotabench_results.json, and the
        results will be recorded on the platform.

        Users should call save() once all predictions have been added, for instance:

        .. code-block:: python

            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

            evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
                                       version=SQuADVersion.V20)

            # processing/setup logic here

            evaluator.reset_time()

            for i, (input, target) in enumerate(data_loader):
                ...
                output = model(input)
                # potentially formatting of the output here
                evaluator.add(output)

            evaluator.save()

        Here, once we have added all the predictions to the evaluator, we call .save() to run the evaluation;
        if on the server, the results are then serialised and recorded on the platform.
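
        The speed_mem_metrics dictionary initialised in __init__ is intended for speed and memory metrics. A
        minimal sketch of populating it before saving - the metric name used here is an illustrative
        assumption, not a fixed schema:

        .. code-block:: python

            import time

            start = time.time()
            # ... evaluation loop with evaluator.add(...) ...
            evaluator.speed_mem_metrics['Evaluation Time (s)'] = time.time() - start

            evaluator.save()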

        :return: BenchmarkResult object with results and metadata
        """