
Commit 5b53d8a

Eric Mintun authored and facebook-github-bot committed
Sample single scale per GPU in LSJ data augmentation.
Summary: Code for sampling only one scale per minibatch in the LSJ data augmentation. `dataloader.train.mode` can be set to `per_image` to sample a separate scale for every image, `per_gpu` to sample one scale per GPU in each minibatch, or `per_batch` to sample a single scale across all GPUs for each minibatch. Padding up to 1024x1024 is removed so that small scales can run faster.

The implementation follows [A Multigrid Method for Efficiently Training Video Models](https://openaccess.thecvf.com/content_CVPR_2020/papers/Wu_A_Multigrid_Method_for_Efficiently_Training_Video_Models_CVPR_2020_paper.pdf): the batch sampler samples scales in the desired mode, then returns a tuple `(index, scale)` for each image. The dataset's `__getitem__` is modified to take such a tuple as input instead of just an index. When sampled this way:

1) Padding is slightly different: it is done by the model (as for non-LSJ data augmentation) rather than by the augmentation. This means padding occurs after horizontal flipping instead of before, and uses a slightly different value (0.0 after normalization, instead of 128 before normalization).

2) Grouping by aspect ratio is turned off, since it is now detrimental.

Reviewed By: vaibhava0

Differential Revision: D29506785

fbshipit-source-id: b252f21668f1508618d127133b64e28b17f48fd6
1 parent 0954ef3 commit 5b53d8a
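The summary describes the sampling mechanism only in prose; below is a minimal sketch of the idea, assuming hypothetical `ScalePerBatchSampler` and `ScaleAwareDataset` helpers (names, signatures, and the 0.1-2.0 scale range are illustrative, not the actual classes added by this commit):

```python
import numpy as np
from torch.utils.data import Dataset, Sampler


class ScalePerBatchSampler(Sampler):
    """Illustrative batch sampler: yields lists of (index, scale) tuples.

    With mode="per_gpu" one scale is drawn for the whole (per-GPU) minibatch;
    "per_image" instead draws one scale per index, and "per_batch" would
    additionally have to broadcast the same scale to all GPUs.
    """

    def __init__(self, indices, batch_size, min_scale=0.1, max_scale=2.0, mode="per_gpu"):
        self.indices = list(indices)
        self.batch_size = batch_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.mode = mode

    def __len__(self):
        return (len(self.indices) + self.batch_size - 1) // self.batch_size

    def __iter__(self):
        order = np.random.permutation(self.indices)
        for start in range(0, len(order), self.batch_size):
            batch = order[start:start + self.batch_size]
            if self.mode == "per_image":
                # A separate scale for every image.
                yield [(int(i), float(np.random.uniform(self.min_scale, self.max_scale)))
                       for i in batch]
            else:
                # One scale for the whole (per-GPU) minibatch; "per_batch" would
                # further synchronize this value across GPUs.
                scale = float(np.random.uniform(self.min_scale, self.max_scale))
                yield [(int(i), scale) for i in batch]


class ScaleAwareDataset(Dataset):
    """Wrapper whose __getitem__ accepts an (index, scale) tuple instead of an index."""

    def __init__(self, dataset, map_with_scale):
        self.dataset = dataset
        self.map_with_scale = map_with_scale  # e.g. applies an LSJ resize at the given scale

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index_scale):
        index, scale = index_scale
        return self.map_with_scale(self.dataset[index], scale)
```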

2 files changed: +63 −41 lines


detectron2/data/dataset_mapper.py

Lines changed: 30 additions & 26 deletions
```diff
@@ -112,6 +112,35 @@ def from_config(cls, cfg, is_train: bool = True):
         )
         return ret
 
+    def _transform_annotations(self, dataset_dict, transforms, image_shape):
+        # USER: Modify this if you want to keep them for some reason.
+        for anno in dataset_dict["annotations"]:
+            if not self.use_instance_mask:
+                anno.pop("segmentation", None)
+            if not self.use_keypoint:
+                anno.pop("keypoints", None)
+
+        # USER: Implement additional transformations if you have other types of data
+        annos = [
+            utils.transform_instance_annotations(
+                obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+        instances = utils.annotations_to_instances(
+            annos, image_shape, mask_format=self.instance_mask_format
+        )
+
+        # After transforms such as cropping are applied, the bounding box may no longer
+        # tightly bound the object. As an example, imagine a triangle object
+        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
+        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
+        # the intersection of original bounding box and the cropping box.
+        if self.recompute_boxes:
+            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+        dataset_dict["instances"] = utils.filter_empty_instances(instances)
+
     def __call__(self, dataset_dict):
         """
         Args:
@@ -157,31 +186,6 @@ def __call__(self, dataset_dict):
             return dataset_dict
 
         if "annotations" in dataset_dict:
-            # USER: Modify this if you want to keep them for some reason.
-            for anno in dataset_dict["annotations"]:
-                if not self.use_instance_mask:
-                    anno.pop("segmentation", None)
-                if not self.use_keypoint:
-                    anno.pop("keypoints", None)
-
-            # USER: Implement additional transformations if you have other types of data
-            annos = [
-                utils.transform_instance_annotations(
-                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
-                )
-                for obj in dataset_dict.pop("annotations")
-                if obj.get("iscrowd", 0) == 0
-            ]
-            instances = utils.annotations_to_instances(
-                annos, image_shape, mask_format=self.instance_mask_format
-            )
+            self._transform_annotations(dataset_dict, transforms, image_shape)
 
-            # After transforms such as cropping are applied, the bounding box may no longer
-            # tightly bound the object. As an example, imagine a triangle object
-            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
-            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
-            # the intersection of original bounding box and the cropping box.
-            if self.recompute_boxes:
-                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
-            dataset_dict["instances"] = utils.filter_empty_instances(instances)
         return dataset_dict
```

detectron2/data/transforms/augmentation_impl.py

Lines changed: 33 additions & 15 deletions
```diff
@@ -206,20 +206,27 @@ def __init__(
         super().__init__()
         self._init(locals())
 
-    def get_transform(self, image: np.ndarray) -> Transform:
-        # Compute the image scale and scaled size.
+    def _get_resize(self, image: np.ndarray, scale: float) -> Transform:
         input_size = image.shape[:2]
-        output_size = (self.target_height, self.target_width)
-        random_scale = np.random.uniform(self.min_scale, self.max_scale)
-        random_scale_size = np.multiply(output_size, random_scale)
-        scale = np.minimum(
-            random_scale_size[0] / input_size[0], random_scale_size[1] / input_size[1]
+
+        # Compute new target size given a scale.
+        target_size = (self.target_height, self.target_width)
+        target_scale_size = np.multiply(target_size, scale)
+
+        # Compute actual rescaling applied to input image and output size.
+        output_scale = np.minimum(
+            target_scale_size[0] / input_size[0], target_scale_size[1] / input_size[1]
         )
-        scaled_size = np.round(np.multiply(input_size, scale)).astype(int)
+        output_size = np.round(np.multiply(input_size, output_scale)).astype(int)
+
         return ResizeTransform(
-            input_size[0], input_size[1], scaled_size[0], scaled_size[1], self.interp
+            input_size[0], input_size[1], output_size[0], output_size[1], self.interp
         )
 
+    def get_transform(self, image: np.ndarray) -> Transform:
+        random_scale = np.random.uniform(self.min_scale, self.max_scale)
+        return self._get_resize(image, random_scale)
+
 
 class RandomRotation(Augmentation):
     """
```
```diff
@@ -279,19 +286,21 @@ class FixedSizeCrop(Augmentation):
     """
     If `crop_size` is smaller than the input image size, then it uses a random crop of
     the crop size. If `crop_size` is larger than the input image size, then it pads
-    the right and the bottom of the image to the crop size.
+    the right and the bottom of the image to the crop size if `pad` is True, otherwise
+    it returns the smaller image.
     """
 
-    def __init__(self, crop_size: Tuple[int], pad_value: float = 128.0):
+    def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0):
         """
         Args:
             crop_size: target image (height, width).
+            pad: if True, will pad images smaller than `crop_size` up to `crop_size`
             pad_value: the padding value.
         """
         super().__init__()
         self._init(locals())
 
-    def get_transform(self, image: np.ndarray) -> TransformList:
+    def _get_crop(self, image: np.ndarray) -> Transform:
         # Compute the image scale and scaled size.
         input_size = image.shape[:2]
         output_size = self.crop_size
@@ -301,19 +310,28 @@ def get_transform(self, image: np.ndarray) -> TransformList:
         max_offset = np.maximum(max_offset, 0)
         offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0))
         offset = np.round(offset).astype(int)
-        crop_transform = CropTransform(
+        return CropTransform(
             offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0]
         )
 
+    def _get_pad(self, image: np.ndarray) -> Transform:
+        # Compute the image scale and scaled size.
+        input_size = image.shape[:2]
+        output_size = self.crop_size
+
         # Add padding if the image is scaled down.
         pad_size = np.subtract(output_size, input_size)
         pad_size = np.maximum(pad_size, 0)
         original_size = np.minimum(input_size, output_size)
-        pad_transform = PadTransform(
+        return PadTransform(
             0, 0, pad_size[1], pad_size[0], original_size[1], original_size[0], self.pad_value
         )
 
-        return TransformList([crop_transform, pad_transform])
+    def get_transform(self, image: np.ndarray) -> TransformList:
+        transforms = [self._get_crop(image)]
+        if self.pad:
+            transforms.append(self._get_pad(image))
+        return TransformList(transforms)
 
 
 class RandomCrop(Augmentation):
```
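For context, a hedged sketch of how these augmentations might be combined for LSJ training once up-front padding is dropped; the 0.1-2.0 scale range and 1024x1024 target are assumed typical LSJ settings, not values taken from this diff.

```python
from detectron2.data import transforms as T

image_size = 1024

# LSJ-style augmentation list. With pad=False the crop no longer pads small images
# up to image_size, so padding is left to the model: it then happens after the
# horizontal flip and uses the post-normalization value 0.0, as noted in the summary.
augmentations = [
    T.ResizeScale(
        min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size
    ),
    T.FixedSizeCrop(crop_size=(image_size, image_size), pad=False),
    T.RandomFlip(horizontal=True),
]
```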
