
Commit 211efad

lyttonhao authored and facebook-github-bot committed
support square padding in backbone
Summary:
X-link: facebookresearch/d2go#258

Support square padding case in backbone.

Reviewed By: wat3rBro

Differential Revision: D35552076

fbshipit-source-id: e4f7f4da62b6ee9b71686071ff6cf2747ecc90e0
1 parent 32c32e3 commit 211efad

File tree: 8 files changed (+78, −12 lines)


detectron2/modeling/backbone/backbone.py

Lines changed: 21 additions & 0 deletions

@@ -1,5 +1,6 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 from abc import ABCMeta, abstractmethod
+from typing import Dict
 import torch.nn as nn

 from detectron2.layers import ShapeSpec
@@ -39,6 +40,26 @@ def size_divisibility(self) -> int:
         """
         return 0

+    @property
+    def padding_constraints(self) -> Dict[str, int]:
+        """
+        This property is a generalization of size_divisibility. Some backbones and training
+        recipes require specific padding constraints, such as enforcing divisibility by a
+        specific integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale
+        jitter in :paper:`vitdet`). `padding_constraints` contains these optional items:
+        {
+            "size_divisibility": int,
+            "square": int,
+            # Future options are possible
+        }
+        `size_divisibility` will be read from here if present, and `square` indicates whether
+        inputs need to be padded to a square. Return an empty dict if there are no specific
+        padding constraints.
+
+        TODO: use type of Dict[str, int] to avoid torchscript issues. The type of
+        padding_constraints could be generalized as TypedDict (Python 3.8+) to support
+        more types in the future.
+        """
+        return {}
+
     def output_shape(self):
         """
         Returns:
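To see how a subclass opts in, here is a minimal sketch of a custom backbone overriding the new property. The class name and layer choices are made up for illustration; only `Backbone`, `ShapeSpec`, and the `padding_constraints` contract come from the codebase, and the example assumes a detectron2 build that includes this change:

```python
from typing import Dict

import torch.nn as nn

from detectron2.layers import ShapeSpec
from detectron2.modeling import Backbone


class SquarePadBackbone(Backbone):
    """Hypothetical backbone that asks callers to pad inputs to a 32-divisible square."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        return {"feat": self.conv(x)}

    def output_shape(self):
        return {"feat": ShapeSpec(channels=64, stride=2)}

    @property
    def padding_constraints(self) -> Dict[str, int]:
        # Consumed by ImageList.from_tensors via the meta-arch changes below.
        return {"square": 1, "size_divisibility": 32}
```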

detectron2/modeling/backbone/fpn.py

Lines changed: 13 additions & 1 deletion

@@ -23,7 +23,14 @@ class FPN(Backbone):
     _fuse_type: torch.jit.Final[str]

     def __init__(
-        self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
+        self,
+        bottom_up,
+        in_features,
+        out_channels,
+        norm="",
+        top_block=None,
+        fuse_type="sum",
+        square_pad=False,
     ):
         """
         Args:
@@ -103,13 +110,18 @@ def __init__(
         self._out_features = list(self._out_feature_strides.keys())
         self._out_feature_channels = {k: out_channels for k in self._out_features}
         self._size_divisibility = strides[-1]
+        self._square_pad = square_pad
         assert fuse_type in {"avg", "sum"}
         self._fuse_type = fuse_type

     @property
     def size_divisibility(self):
         return self._size_divisibility

+    @property
+    def padding_constraints(self):
+        return {"square": int(self._square_pad)}
+
     def forward(self, x):
         """
         Args:
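A quick construction-time check of the new flag. The tiny bottom-up below is a stand-in invented for this sketch, not part of the commit; it exists only so the FPN can be instantiated without a config:

```python
import torch.nn as nn

from detectron2.layers import ShapeSpec
from detectron2.modeling import Backbone
from detectron2.modeling.backbone.fpn import FPN


class TinyBottomUp(Backbone):
    """Hypothetical two-stage bottom-up, just enough to build an FPN on top."""

    def __init__(self):
        super().__init__()
        self.s1 = nn.Conv2d(3, 8, kernel_size=3, stride=4, padding=1)
        self.s2 = nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        c1 = self.s1(x)
        return {"c1": c1, "c2": self.s2(c1)}

    def output_shape(self):
        return {"c1": ShapeSpec(channels=8, stride=4), "c2": ShapeSpec(channels=16, stride=8)}


fpn = FPN(TinyBottomUp(), in_features=["c1", "c2"], out_channels=16, square_pad=True)
print(fpn.padding_constraints)  # {'square': 1}
print(fpn.size_divisibility)    # 8, the stride of the coarsest input level
```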

detectron2/modeling/meta_arch/dense_detector.py

Lines changed: 5 additions & 2 deletions

@@ -62,7 +62,6 @@ def __init__(
             self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
         else:
             self.head_in_features = head_in_features
-
         self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
         self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

@@ -127,7 +126,11 @@ def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
         """
         images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
         return images

     def _transpose_dense_predictions(

detectron2/modeling/meta_arch/panoptic_fpn.py

Lines changed: 4 additions & 1 deletion

@@ -119,7 +119,10 @@ def forward(self, batched_inputs):
         assert "sem_seg" in batched_inputs[0]
         gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
         gt_sem_seg = ImageList.from_tensors(
-            gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+            gt_sem_seg,
+            self.backbone.size_divisibility,
+            self.sem_seg_head.ignore_value,
+            self.backbone.padding_constraints,
         ).tensor
         sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)

detectron2/modeling/meta_arch/rcnn.py

Lines changed: 10 additions & 2 deletions

@@ -227,7 +227,11 @@ def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
         """
         images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
         return images

     @staticmethod
@@ -305,7 +309,11 @@ def forward(self, batched_inputs):
         """
         images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
         features = self.backbone(images.tensor)

         if "instances" in batched_inputs[0]:

detectron2/modeling/meta_arch/semantic_seg.py

Lines changed: 9 additions & 2 deletions

@@ -99,14 +99,21 @@ def forward(self, batched_inputs):
         """
         images = [x["image"].to(self.device) for x in batched_inputs]
         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )

         features = self.backbone(images.tensor)

         if "sem_seg" in batched_inputs[0]:
             targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
             targets = ImageList.from_tensors(
-                targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+                targets,
+                self.backbone.size_divisibility,
+                self.sem_seg_head.ignore_value,
+                self.backbone.padding_constraints,
             ).tensor
         else:
             targets = None
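Because the ground-truth maps are padded under the same constraints as the images, per-pixel targets stay aligned with square-padded inputs. A small self-contained check against a build that includes this change (values are arbitrary; 255 stands in for an ignore label):

```python
import torch

from detectron2.structures import ImageList

constraints = {"square": 1}  # e.g. what an FPN built with square_pad=True reports
images = [torch.rand(3, 20, 30)]
targets = [torch.randint(0, 10, (20, 30))]

image_list = ImageList.from_tensors(images, 32, padding_constraints=constraints)
target_list = ImageList.from_tensors(targets, 32, 255, constraints)

# Both are padded 30 -> square -> rounded up to the divisibility of 32.
assert image_list.tensor.shape[-2:] == target_list.tensor.shape[-2:] == torch.Size([32, 32])
```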

detectron2/modeling/postprocessing.py

Lines changed: 0 additions & 1 deletion

@@ -23,7 +23,6 @@ def detector_postprocess(
             `results.image_size` contains the input image resolution the detector sees.
             This object might be modified in-place.
         output_height, output_width: the desired output resolution.
-
     Returns:
         Instances: the resized output from the model, based on the output resolution
     """

detectron2/structures/image_list.py

Lines changed: 16 additions & 3 deletions

@@ -1,6 +1,6 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 from __future__ import division
-from typing import Any, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 import torch
 from torch import device
 from torch.nn import functional as F
@@ -57,7 +57,10 @@ def device(self) -> device:

     @staticmethod
     def from_tensors(
-        tensors: List[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0
+        tensors: List[torch.Tensor],
+        size_divisibility: int = 0,
+        pad_value: float = 0.0,
+        padding_constraints: Optional[Dict[str, int]] = None,
     ) -> "ImageList":
         """
         Args:
@@ -67,7 +70,11 @@ def from_tensors(
             size_divisibility (int): If `size_divisibility > 0`, add padding to ensure
                 the common height and width is divisible by `size_divisibility`.
                 This depends on the model and many models need a divisibility of 32.
-            pad_value (float): value to pad
+            pad_value (float): value to pad.
+            padding_constraints (Optional[Dict]): If given, it follows the format
+                {"size_divisibility": int, "square": int}, where `size_divisibility`
+                overrides the argument above if present, and `square` indicates whether
+                inputs need to be padded to a square.

         Returns:
             an `ImageList`.
@@ -82,6 +89,12 @@ def from_tensors(
         image_sizes_tensor = [shapes_to_tensor(x) for x in image_sizes]
         max_size = torch.stack(image_sizes_tensor).max(0).values

+        if padding_constraints is not None:
+            if padding_constraints.get("square", 0) > 0:
+                # pad to square.
+                max_size[0] = max_size[1] = max_size.max()
+            if "size_divisibility" in padding_constraints:
+                size_divisibility = padding_constraints["size_divisibility"]
         if size_divisibility > 1:
             stride = size_divisibility
             # the last two dims are H,W, both subject to divisibility requirement
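The effect of the new branch, as a sketch with shapes worked out by hand (again assuming a detectron2 build that includes this change):

```python
import torch

from detectron2.structures import ImageList

imgs = [torch.rand(3, 20, 30), torch.rand(3, 25, 17)]

# Default behavior: pad to the per-dimension max, (25, 30).
plain = ImageList.from_tensors(imgs)
print(plain.tensor.shape)  # torch.Size([2, 3, 25, 30])

# New behavior: square padding takes the overall max (30), then the
# "size_divisibility" key overrides the argument and rounds up to 32.
squared = ImageList.from_tensors(
    imgs, padding_constraints={"square": 1, "size_divisibility": 32}
)
print(squared.tensor.shape)  # torch.Size([2, 3, 32, 32])
```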
