Commit e85761c
Update imagenet evaluation to batch design
1 parent 6f2f1ce

2 files changed (+176, -73 lines)

Lines changed: 172 additions & 70 deletions
@@ -1,37 +1,95 @@
 import numpy as np
+import pickle
 from sotabenchapi.core import BenchmarkResult, check_inputs
 import tqdm

 from sotabencheval.utils import AverageMeter
 from .utils import top_k_accuracy_score

-class ImageNet:
+
+
+class ImageNetEvaluator(object):
+    """`ImageNet <https://www.sotabench.com/benchmark/imagenet>`_ benchmark.
+
+    Examples:
+        Evaluate a ResNeXt model from the torchvision repository:
+
+        .. code-block:: python
+
+            import numpy as np
+            import PIL
+            import torch
+            from sotabencheval.image_classification import ImageNetEvaluator
+            from torchvision.models.resnet import resnext101_32x8d
+            import torchvision.transforms as transforms
+            from torchvision.datasets import ImageNet
+            from torch.utils.data import DataLoader
+
+            model = resnext101_32x8d(pretrained=True)
+
+            # Define the transforms needed to convert ImageNet data to the
+            # expected model input
+            normalize = transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+            )
+            input_transform = transforms.Compose([
+                transforms.Resize(256, PIL.Image.BICUBIC),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                normalize,
+            ])
+
+            test_dataset = ImageNet(
+                './data',
+                split="val",
+                transform=input_transform,
+                target_transform=None,
+                download=True,
+            )
+
+            test_loader = DataLoader(
+                test_dataset,
+                batch_size=128,
+                shuffle=False,
+                num_workers=4,
+                pin_memory=True,
+            )
+
+            device = torch.device('cuda')
+            model = model.to(device)
+            model.eval()
+
+            evaluator = ImageNetEvaluator(
+                paper_model_name='ResNeXt-101-32x8d',
+                paper_arxiv_id='1611.05431')
+
+            with torch.no_grad():
+                for i, (input, target) in enumerate(test_loader):
+                    input = input.to(device=device, non_blocking=True)
+                    target = target.to(device=device, non_blocking=True)
+                    output = model(input)
+
+                    image_ids = [
+                        img[0].split('/')[-1].replace('.JPEG', '')
+                        for img in test_loader.dataset.imgs[
+                            i * test_loader.batch_size:(i + 1) * test_loader.batch_size]
+                    ]
+
+                    evaluator.update(dict(zip(image_ids, list(output.cpu().numpy()))))
+
+            print(evaluator.get_results())
+
+            evaluator.save()
+    """
+
     task = "Image Classification"

-    @classmethod
-    @check_inputs
-    def benchmark(
-        cls,
-        results_dict,
-        data_root: str = "./.data/vision/imagenet",
-        paper_model_name: str = None,
-        paper_arxiv_id: str = None,
-        paper_pwc_id: str = None,
-        paper_results: dict = None,
-        pytorch_hub_url: str = None,
-        model_description=None,
-    ) -> BenchmarkResult:
+    def __init__(self,
+                 paper_model_name: str = None,
+                 paper_arxiv_id: str = None,
+                 paper_pwc_id: str = None,
+                 paper_results: dict = None,
+                 pytorch_hub_url: str = None,
+                 model_description=None,):
         """Benchmarking function.

         Args:
-            results_dict (dict): dict with keys as image IDs and values as a 1D 1000 x 1 np.ndarrays
-                of logits. For example: {'ILSVRC2012_val_00000293': array([1.27443619e+01, ...]), ...}. There
-                should be 5000 key/value pairs for the validation set.
-            data_root (str): The location of the ImageNet dataset - change this
-                parameter when evaluating locally if your ImageNet data is
-                located in a different folder (or alternatively if you want to
-                download to an alternative location).
-            model_description (str, optional): Optional model description.
             paper_model_name (str, optional): The name of the model from the
                 paper - if you want to link your build to a machine learning
                 paper. See the ImageNet benchmark page for model names,
@@ -56,64 +114,108 @@ def benchmark(
             pytorch_hub_url (str, optional): Optional linking to PyTorch Hub
                 url if your model is linked there; e.g:
                 'nvidia_deeplearningexamples_waveglow'.
+            model_description (str, optional): Optional model description.
         """

-        print("Benchmarking on ImageNet...")
+        self.paper_model_name = paper_model_name
+        self.paper_arxiv_id = paper_arxiv_id
+        self.paper_pwc_id = paper_pwc_id
+        self.paper_results = paper_results
+        self.pytorch_hub_url = pytorch_hub_url
+        self.model_description = model_description

-        config = locals()
+        self.top1 = None
+        self.top5 = None

-        try:
-            test_dataset = cls.dataset(
-                data_root,
-                split="val",
-                transform=cls.input_transform,
-                target_transform=None,
-                download=True,
-            )
-        except Exception:
-            test_dataset = cls.dataset(
-                data_root,
-                split="val",
-                transform=cls.input_transform,
-                target_transform=None,
-                download=False,
-            )
+        with open('imagenet_val_targets.pkl', 'rb') as handle:
+            self.targets = pickle.load(handle)
+
+        self.outputs = {}
+        self.results = None
+
+    def update(self, output_dict: dict):
+        """
+        Update the evaluator with new results.
+
+        :param output_dict (dict): Keys are image IDs, and each value should be a 1D np.ndarray of size 1000
+            containing logits for that image ID.
+        :return: void - updates self.outputs with the new IDs and predictions
+
+        Examples:
+            Update the evaluator with two results:
+
+            .. code-block:: python
+
+                my_evaluator.update({'ILSVRC2012_val_00000293': np.array([1.04243, ...]),
+                                     'ILSVRC2012_val_00000294': np.array([-2.3677, ...])})
+        """
+
+        self.outputs = dict(list(self.outputs.items()) + list(output_dict.items()))
+
+    def get_results(self):
+        """
+        Gets the results for the evaluator. This method only runs if predictions for all 5,000 ImageNet validation
+        images are available; otherwise it raises an error that reports the missing or unmatched IDs.
+
+        :return: dict with Top 1 and Top 5 Accuracy
+        """
+
+        if set(self.targets.keys()) != set(self.outputs.keys()):
+            missing_ids = set(self.targets.keys()) - set(self.outputs.keys())
+            unmatched_ids = set(self.outputs.keys()) - set(self.targets.keys())

-        top1 = AverageMeter()
-        top5 = AverageMeter()
+            if len(unmatched_ids) > 0:
+                raise AttributeError('''There are {mis_no} missing and {un_no} unmatched image IDs\n\n'''
+                                     '''Missing IDs are {missing}\n\n'''
+                                     '''Unmatched IDs are {unmatched}'''.format(mis_no=len(missing_ids),
+                                                                                un_no=len(unmatched_ids),
+                                                                                missing=missing_ids,
+                                                                                unmatched=unmatched_ids))
+            else:
+                raise AttributeError('''There are {mis_no} missing image IDs\n\n'''
+                                     '''Missing IDs are {missing}'''.format(mis_no=len(missing_ids),
+                                                                            missing=missing_ids))

-        for i, (_, target) in enumerate(tqdm.tqdm(test_dataset)):
-            image_id = test_dataset.imgs[i][0].split('/')[-1].replace('.JPEG', '')
-            output = results_dict[image_id]
-            target = target.cpu().numpy()
+        # Do the calculation only if we have all the results...
+        self.top1 = AverageMeter()
+        self.top5 = AverageMeter()

+        for i, dict_key in enumerate(tqdm.tqdm(self.targets.keys())):
+            output = self.outputs[dict_key]
+            target = self.targets[dict_key]
             prec1 = top_k_accuracy_score(y_true=target, y_pred=np.array([output]), k=1)
             prec5 = top_k_accuracy_score(y_true=target, y_pred=np.array([output]), k=5)
-            top1.update(prec1, 1)
-            top5.update(prec5, 1)
-
-        final_results = {
-            'Top 1 Accuracy': prec1.avg,
-            'Top 5 Accuracy': prec5.avg
-        }
-
-        print(
-            " * Acc@1 {top1:.3f} Acc@5 {top5:.3f}".format(
-                top1=final_results["Top 1 Accuracy"],
-                top5=final_results["Top 5 Accuracy"],
-            )
-        )
+            self.top1.update(prec1, 1)
+            self.top5.update(prec5, 1)
+
+        self.results = {'Top 1 Accuracy': self.top1.avg, 'Top 5 Accuracy': self.top5.avg}
+
+        return self.results
+
+    def save(self):
+        """
+        Calculate results and then put them into a BenchmarkResult object.
+
+        On the sotabench.com server, this will produce a JSON serialisation of the results and record
+        them on the platform.
+
+        :return: BenchmarkResult object with results and metadata
+        """
+
+        if not self.results:
+            self.get_results()

         return BenchmarkResult(
-            task=cls.task,
-            config=config,
-            dataset=cls.dataset.__name__,
-            results=final_results,
-            pytorch_hub_id=pytorch_hub_url,
-            model=paper_model_name,
-            model_description=model_description,
-            arxiv_id=paper_arxiv_id,
-            pwc_id=paper_pwc_id,
-            paper_results=paper_results,
+            task=self.task,
+            config={},
+            dataset='ImageNet',
+            results=self.results,
+            pytorch_hub_id=self.pytorch_hub_url,
+            model=self.paper_model_name,
+            model_description=self.model_description,
+            arxiv_id=self.paper_arxiv_id,
+            pwc_id=self.paper_pwc_id,
+            paper_results=self.paper_results,
             run_hash=None,
         )
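The batch design lets predictions be fed to the evaluator incrementally instead of as one results_dict. Below is a minimal sketch of that flow (not part of the commit), using two illustrative image IDs and random logits in place of real model outputs; it assumes imagenet_val_targets.pkl is available, since __init__ loads it.

    import numpy as np
    from sotabencheval.image_classification import ImageNetEvaluator

    evaluator = ImageNetEvaluator(
        paper_model_name='ResNeXt-101-32x8d',
        paper_arxiv_id='1611.05431',
    )

    # Each update() merges one batch of {image_id: logits} pairs into
    # evaluator.outputs; the dict(list(...) + list(...)) merge behaves like
    # evaluator.outputs.update(batch).
    batch_ids = ['ILSVRC2012_val_00000293', 'ILSVRC2012_val_00000294']  # illustrative IDs
    logits = np.random.randn(len(batch_ids), 1000)  # stand-in for model outputs
    evaluator.update(dict(zip(batch_ids, list(logits))))

    # get_results() raises AttributeError until every ID in the targets pickle
    # has a matching prediction; save() calls it automatically when needed and
    # wraps the metrics in a BenchmarkResult.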

sotabencheval/image_classification/utils.py

Lines changed: 4 additions & 3 deletions

@@ -1,9 +1,10 @@
 import numpy as np

 def top_k_accuracy_score(y_true, y_pred, k=5, normalize=True):
-    """Top k Accuracy classification score.
-    """
-    assert(y_true.shape == 1)  # should be 1D, each index is obs true label
+    """Top k Accuracy classification score."""
+
+    if len(y_true.shape) == 2:
+        y_true = y_true[0]  # should be one-dimensional

     num_obs, num_labels = y_pred.shape
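The removed assert compared a shape tuple against the integer 1, so it failed on every call; the replacement flattens a 2D y_true instead. A short usage sketch of the patched helper (assuming, as the evaluator's call sites suggest, that normalize=True returns the fraction of observations whose true label is among the k highest-scoring classes):

    import numpy as np
    from sotabencheval.image_classification.utils import top_k_accuracy_score

    y_pred = np.array([[0.1, 0.3, 0.6],   # top-2 classes: 2, 1
                       [0.6, 0.3, 0.1]])  # top-2 classes: 0, 1
    y_true = np.array([2, 2])

    print(top_k_accuracy_score(y_true, y_pred, k=2))  # expected 0.5: only the
                                                      # first label is in its top-2

    # A 2D y_true such as np.array([[2, 2]]) is now reduced to its first row
    # rather than tripping the old, always-failing assert.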
