class BaseEvaluator:
10+ """Base class for evaluator objects on tasks
11+
12+ Currently SQuAD and WMT use this as a parent.
13+
14+ TODO: Refactor ImageNet, COCO, ADE20K, PASCAL to utilise this class
15+
16+ The core API design relies upon:
17+
18+ (a) Initializing an Evaluator object and linking to a paper, for example:
19+
20+ .. code-block:: python
21+
22+ from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
23+
24+ evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
25+ version=SQuADVersion.V20)
26+
27+ The paper metadata allows the results to be linked to paper results when submitted to sotabench.com.
28+
29+ (b) Adding Predictions (usually in batch) - example below for PyTorch iterating over DataLoader:
30+
31+ .. code-block:: python
32+
33+ for i, (input, target) in enumerate(data_loader):
34+ ...
35+ output = model(input)
36+ # potentially formatting of the output here
37+ evaluator.add(output)
38+
39+ These results are accumulated and then evaluated - i.e. metrics are calculated once done.
40+
41+ (c) Saving Results
42+
43+ .. code-block:: python
44+ evaluator.save()
45+
46+ Gets the evaluation results for the current predictions added to the Evaluation object - calculates metrics -
47+ then run if on the server, serializes results to a sotabench_results.json file which is processed and results
48+ are stored on the server.
49+
50+ These three steps: initialization -> adding predictions -> saving and evaluating results are the core API.
51+ They should be capable of integration with any existing evaluation logic in your repository.
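
    For fast re-runs, these steps can be combined with the cache_exists property (documented below) so that
    evaluation is skipped when the same model has already been evaluated. A minimal sketch, assuming a
    PyTorch-style model and data_loader as in the examples above:

    .. code-block:: python

        evaluator.reset_time()

        for i, (input, target) in enumerate(data_loader):
            output = model(input)
            # potentially formatting of the output here
            evaluator.add(output)
            if evaluator.cache_exists:
                break

        evaluator.save()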
    """

    def __init__(self,
                 model_name: str = None,
                 paper_arxiv_id: str = None,
                 paper_pwc_id: str = None,
                 paper_results: dict = None,
                 model_description=None):
        """
        Initializes a BaseEvaluator-like object.

        :param model_name: (str) The name of the model, for example 'ResNet-101', which will be saved to
            sotabench.com
        :param paper_arxiv_id: (str, optional) The arXiv id of the paper that the model is linked to,
            e.g. '1906.06423'
        :param paper_pwc_id: (str, optional) The PWC paper id (slug), e.g. 'albert-a-lite-bert-for-self-supervised'
        :param paper_results: (dict, optional) If the paper you are linking to does not have results on
            sotabench, you can add the paper's results here as a dictionary with metric names as keys and
            metric values as values. The metric names are benchmark specific; see the sketch below.
        :param model_description: (str, optional) Optional description for the model; this can contain details
            about where the weights are from, details about training, and more. This will appear in an info box
            for the model when it is displayed on sotabench.com.
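
        For example, a paper_results dictionary might look like the following (the metric names here are
        purely illustrative - use the metric names defined by the benchmark you are evaluating on):

        .. code-block:: python

            paper_results = {'EM': 85.7, 'F1': 88.7}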
        """

        # Model metadata

        self.model_name = model_name
        self.paper_arxiv_id = paper_arxiv_id
        self.paper_pwc_id = paper_pwc_id
        self.paper_results = paper_results
        self.model_description = model_description

        # Backend variables for hashing and caching

        self.first_batch_processed = False
        self.batch_hash = None
        self.cached_results = False
        self.results = None
        self._cache_exists = None

        # Speed and memory metrics

        self.init_time = time.time()
        self.speed_mem_metrics = {}

    # ... lines omitted; the following is from the cache_exists property ...

        then sets self.results to cached results and returns True.

        You can use this property for control flow to break a for loop over a dataset
        after the first iteration. This prevents re-running the same calculation for the
        same model twice.

        Q: Why should the user use this?
        A: If you want fast "continuous evaluation" and want to avoid rerunning the same model over and over
        each time you commit something new to your repository.

        Examples:
            Breaking a for loop if the model is the same as the one we evaluated last time

            .. code-block:: python


    # ... lines omitted ...

        return self._cache_exists

    def cache_values(self, **kwargs):
        """
        Takes in keyword arguments and converts each to a hashable (cacheable) format.

        :param kwargs: keyword arguments to be cached
        :return: a cacheable version of the keyword arguments
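
        A minimal sketch of how a subclass might use this when hashing its first batch of predictions (the
        ``first_batch_outputs`` variable and the hashing step are illustrative assumptions, not part of this
        class):

        .. code-block:: python

            if not self.first_batch_processed:
                hashable = self.cache_values(outputs=first_batch_outputs, model_name=self.model_name)
                self.batch_hash = str(hash(str(hashable)))  # illustrative hashing only
                self.first_batch_processed = True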
        """
        return cache_value(kwargs)

    def reset_time(self):
        """
        Simple method to reset the timer self.init_time. Often used before an evaluation loop, so that the
        evaluation is timed appropriately, for example:

        .. code-block:: python

            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

            evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
                                       version=SQuADVersion.V20)

            # processing/setup logic here

            evaluator.reset_time()

            for i, (input, target) in enumerate(data_loader):
                ...
                output = model(input)
                # potentially formatting of the output here
                evaluator.add(output)

            evaluator.save()

        Above we may have processing logic in between the evaluator initialization and the actual evaluation
        loop, so we reset the timer to get a fair timing of the evaluation itself (rather than of setup steps
        like data processing, loading the model, etc.).

        :return: void - resets self.init_time
        """
        self.init_time = time.time()

    def save(self, **kwargs):
        """
        Calculate results and then put them into a BenchmarkResult object.

        On the sotabench.com server, this will produce a JSON serialisation in sotabench_results.json, and the
        results will be recorded on the platform.

        Users should call save() once all predictions have been added, for instance:

        .. code-block:: python

            from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

            evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
                                       version=SQuADVersion.V20)

            # processing/setup logic here

            evaluator.reset_time()

            for i, (input, target) in enumerate(data_loader):
                ...
                output = model(input)
                # potentially formatting of the output here
                evaluator.add(output)

            evaluator.save()

        Here, once we have added all the predictions to the evaluator, we call .save() to run the evaluation;
        if on the server, the results are then serialised and recorded on the platform.
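
        The speed_mem_metrics dictionary initialised in __init__ is intended for speed and memory metrics. A
        minimal sketch of populating it before saving - the metric name used here is an illustrative
        assumption, not a fixed schema:

        .. code-block:: python

            import time

            start = time.time()
            # ... evaluation loop with evaluator.add(...) ...
            evaluator.speed_mem_metrics['Evaluation Time (s)'] = time.time() - start

            evaluator.save()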

        :return: BenchmarkResult object with results and metadata
        """