improve Coco implementation (#3417)

pmeier · fmassa · web-flow · commit 62e185c7ee6d · 2021-02-22T14:15:16.000+01:00
Co-authored-by: Francisco Massa <fvsmassa@gmail.com>
diff --git a/torchvision/datasets/coco.py b/torchvision/datasets/coco.py
@@ -2,11 +2,11 @@
 from PIL import Image
 import os
 import os.path
-from typing import Any, Callable, Optional, Tuple
+from typing import Any, Callable, Optional, Tuple, List
 
 
-class CocoCaptions(VisionDataset):
-    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
+class CocoDetection(VisionDataset):
+    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
 
     Args:
         root (string): Root directory where images are downloaded to.
@@ -17,77 +17,45 @@ class CocoCaptions(VisionDataset):
             target and transforms it.
         transforms (callable, optional): A function/transform that takes input sample and its target as entry
             and returns a transformed version.
-
-    Example:
-
-        .. code:: python
-
-            import torchvision.datasets as dset
-            import torchvision.transforms as transforms
-            cap = dset.CocoCaptions(root = 'dir where images are',
-                                    annFile = 'json annotation file',
-                                    transform=transforms.ToTensor())
-
-            print('Number of samples: ', len(cap))
-            img, target = cap[3] # load 4th sample
-
-            print("Image Size: ", img.size())
-            print(target)
-
-        Output: ::
-
-            Number of samples: 82783
-            Image Size: (3L, 427L, 640L)
-            [u'A plane emitting smoke stream flying over a mountain.',
-            u'A plane darts across a bright blue sky behind a mountain covered in snow',
-            u'A plane leaves a contrail above the snowy mountain top.',
-            u'A mountain that has a plane flying overheard in the distance.',
-            u'A mountain view with a plume of smoke in the background']
-
     """
 
     def __init__(
-            self,
-            root: str,
-            annFile: str,
-            transform: Optional[Callable] = None,
-            target_transform: Optional[Callable] = None,
-            transforms: Optional[Callable] = None,
-    ) -> None:
-        super(CocoCaptions, self).__init__(root, transforms, transform, target_transform)
+        self,
+        root: str,
+        annFile: str,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+        transforms: Optional[Callable] = None,
+    ):
+        super().__init__(root, transforms, transform, target_transform)
         from pycocotools.coco import COCO
+
         self.coco = COCO(annFile)
         self.ids = list(sorted(self.coco.imgs.keys()))
 
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        """
-        Args:
-            index (int): Index
-
-        Returns:
-            tuple: Tuple (image, target). target is a list of captions for the image.
-        """
-        coco = self.coco
-        img_id = self.ids[index]
-        ann_ids = coco.getAnnIds(imgIds=img_id)
-        anns = coco.loadAnns(ann_ids)
-        target = [ann['caption'] for ann in anns]
+    def _load_image(self, id: int) -> Image.Image:
+        path = self.coco.loadImgs(id)[0]["file_name"]
+        return Image.open(os.path.join(self.root, path)).convert("RGB")
 
-        path = coco.loadImgs(img_id)[0]['file_name']
+    def _load_target(self, id) -> List[Any]:
+        return self.coco.loadAnns(self.coco.getAnnIds(id))
 
-        img = Image.open(os.path.join(self.root, path)).convert('RGB')
+    def __getitem__(self, index: int) -> Tuple[Any, Any]:
+        id = self.ids[index]
+        image = self._load_image(id)
+        target = self._load_target(id)
 
         if self.transforms is not None:
-            img, target = self.transforms(img, target)
+            image, target = self.transforms(image, target)
 
-        return img, target
+        return image, target
 
     def __len__(self) -> int:
         return len(self.ids)
 
 
-class CocoDetection(VisionDataset):
-    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
+class CocoCaptions(CocoDetection):
+    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
 
     Args:
         root (string): Root directory where images are downloaded to.
@@ -98,41 +66,34 @@ class CocoDetection(VisionDataset):
             target and transforms it.
         transforms (callable, optional): A function/transform that takes input sample and its target as entry
             and returns a transformed version.
-    """
 
-    def __init__(
-            self,
-            root: str,
-            annFile: str,
-            transform: Optional[Callable] = None,
-            target_transform: Optional[Callable] = None,
-            transforms: Optional[Callable] = None,
-    ) -> None:
-        super(CocoDetection, self).__init__(root, transforms, transform, target_transform)
-        from pycocotools.coco import COCO
-        self.coco = COCO(annFile)
-        self.ids = list(sorted(self.coco.imgs.keys()))
+    Example:
 
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        """
-        Args:
-            index (int): Index
+        .. code:: python
+
+            import torchvision.datasets as dset
+            import torchvision.transforms as transforms
+            cap = dset.CocoCaptions(root = 'dir where images are',
+                                    annFile = 'json annotation file',
+                                    transform=transforms.ToTensor())
 
-        Returns:
-            tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``.
-        """
-        coco = self.coco
-        img_id = self.ids[index]
-        ann_ids = coco.getAnnIds(imgIds=img_id)
-        target = coco.loadAnns(ann_ids)
+            print('Number of samples: ', len(cap))
+            img, target = cap[3] # load 4th sample
 
-        path = coco.loadImgs(img_id)[0]['file_name']
+            print("Image Size: ", img.size())
+            print(target)
 
-        img = Image.open(os.path.join(self.root, path)).convert('RGB')
-        if self.transforms is not None:
-            img, target = self.transforms(img, target)
+        Output: ::
 
-        return img, target
+            Number of samples: 82783
+            Image Size: (3L, 427L, 640L)
+            [u'A plane emitting smoke stream flying over a mountain.',
+            u'A plane darts across a bright blue sky behind a mountain covered in snow',
+            u'A plane leaves a contrail above the snowy mountain top.',
+            u'A mountain that has a plane flying overheard in the distance.',
+            u'A mountain view with a plume of smoke in the background']
 
-    def __len__(self) -> int:
-        return len(self.ids)
+    """
+
+    def _load_target(self, id) -> List[str]:
+        return [ann["caption"] for ann in super()._load_target(id)]