Update perspective_bounding_boxes for rotated boxes

AntoineSimoulin · AntoineSimoulin · commit 580ae4bb376b · 2025-06-10T14:48:56.000-07:00
Test Plan:
```bash
pytest test/test_transforms_v2.py -vvv -k "TestPerspective and test_kernel_bounding_boxes"

pytest test/test_transforms_v2.py -vvv -k "TestPerspective and test_correctness_perspective_bounding_boxes"
```
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -647,6 +647,183 @@ def affine_rotated_bounding_boxes(bounding_boxes):
     )
 
 
+def reference_perspective_bounding_boxes(bounding_boxes, *, coefficients, new_canvas_size=None, clamp=True):
+    format = bounding_boxes.format
+    canvas_size = new_canvas_size or bounding_boxes.canvas_size
+
+    def perspective_bounding_boxes(bounding_boxes):
+        dtype = bounding_boxes.dtype
+        device = bounding_boxes.device
+        m1 = np.array(
+            [
+                [coefficients[0], coefficients[1], coefficients[2]],
+                [coefficients[3], coefficients[4], coefficients[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [coefficients[6], coefficients[7], 1.0],
+                [coefficients[6], coefficients[7], 1.0],
+            ]
+        )
+
+        # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
+        input_xyxy = F.convert_bounding_box_format(
+            bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True),
+            old_format=format,
+            new_format=tv_tensors.BoundingBoxFormat.XYXY,
+            inplace=True,
+        )
+        x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist()
+
+        points = np.array(
+            [
+                [x1, y1, 1.0],
+                [x2, y1, 1.0],
+                [x1, y2, 1.0],
+                [x2, y2, 1.0],
+            ]
+        )
+
+        numerator = points @ m1.T
+        denominator = points @ m2.T
+        transformed_points = numerator / denominator
+
+        output_xyxy = torch.Tensor(
+            [
+                float(np.min(transformed_points[:, 0])),
+                float(np.min(transformed_points[:, 1])),
+                float(np.max(transformed_points[:, 0])),
+                float(np.max(transformed_points[:, 1])),
+            ]
+        )
+
+        output = F.convert_bounding_box_format(
+            output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format
+        )
+
+        if clamp:
+            # It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
+            output = F.clamp_bounding_boxes(
+                output,
+                format=format,
+                canvas_size=canvas_size,
+            )
+        else:
+            # We leave the bounding box as float32 so the caller gets the full precision to perform any additional
+            # operation
+            dtype = output.dtype
+
+        return output.to(dtype=dtype, device=device)
+
+    return tv_tensors.BoundingBoxes(
+        torch.cat([perspective_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape(
+            bounding_boxes.shape
+        ),
+        format=format,
+        canvas_size=canvas_size,
+    )
+
+
+def reference_perspective_rotated_bounding_boxes(bounding_boxes, *, coefficients, new_canvas_size=None, clamp=True):
+    format = bounding_boxes.format
+    canvas_size = new_canvas_size or bounding_boxes.canvas_size
+
+    def perspective_rotated_bounding_boxes(bounding_boxes):
+        dtype = bounding_boxes.dtype
+        device = bounding_boxes.device
+        m1 = np.array(
+            [
+                [coefficients[0], coefficients[1], coefficients[2]],
+                [coefficients[3], coefficients[4], coefficients[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [coefficients[6], coefficients[7], 1.0],
+                [coefficients[6], coefficients[7], 1.0],
+            ]
+        )
+
+        # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
+        input_xyxyxyxy = F.convert_bounding_box_format(
+            bounding_boxes.to(device="cpu", copy=True),
+            old_format=format,
+            new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
+            inplace=True,
+        )
+        x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist()
+
+        points = np.array(
+            [
+                [x1, y1, 1.0],
+                [x2, y2, 1.0],
+                [x3, y3, 1.0],
+                [x4, y4, 1.0],
+            ]
+        )
+
+        numerator = points @ m1.astype(points.dtype).T
+        denominator = points @ m2.astype(points.dtype).T
+        transformed_points = numerator / denominator
+
+        output = torch.Tensor(
+            [
+                float(transformed_points[0, 0]),
+                float(transformed_points[0, 1]),
+                float(transformed_points[1, 0]),
+                float(transformed_points[1, 1]),
+                float(transformed_points[2, 0]),
+                float(transformed_points[2, 1]),
+                float(transformed_points[3, 0]),
+                float(transformed_points[3, 1]),
+            ]
+        )
+        output = _parallelogram_to_bounding_boxes(output)
+
+        output = F.convert_bounding_box_format(
+            output, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format
+        )
+
+        if torch.is_floating_point(output) and dtype in (
+            torch.uint8,
+            torch.int8,
+            torch.int16,
+            torch.int32,
+            torch.int64,
+        ):
+            # it is better to round before cast
+            output = torch.round(output)
+
+        if clamp:
+            # It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
+            output = F.clamp_bounding_boxes(
+                output,
+                format=format,
+                canvas_size=canvas_size,
+            )
+        else:
+            # We leave the bounding box as float32 so the caller gets the full precision to perform any additional
+            # operation
+            dtype = output.dtype
+
+        return output.to(dtype=dtype, device=device)
+
+    return tv_tensors.BoundingBoxes(
+        torch.cat(
+            [
+                perspective_rotated_bounding_boxes(b)
+                for b in bounding_boxes.reshape(
+                    -1, 5 if format != tv_tensors.BoundingBoxFormat.XYXYXYXY else 8
+                ).unbind()
+            ],
+            dim=0,
+        ).reshape(bounding_boxes.shape),
+        format=format,
+        canvas_size=canvas_size,
+    )
+
+
 class TestResize:
     INPUT_SIZE = (17, 11)
     OUTPUT_SIZES = [17, [17], (17,), None, [12, 13], (12, 13)]
@@ -4259,7 +4436,7 @@ def test_kernel_image_error(self):
         coefficients=COEFFICIENTS,
         start_end_points=START_END_POINTS,
     )
-    @pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
     def test_kernel_bounding_boxes(self, param, value, format):
         if param == "start_end_points":
             kwargs = dict(zip(["startpoints", "endpoints"], value))
@@ -4363,79 +4540,18 @@ def test_image_functional_correctness(self, coefficients, interpolation, fill):
             assert_equal(actual, expected)
 
     def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints):
-        format = bounding_boxes.format
-        canvas_size = bounding_boxes.canvas_size
-        dtype = bounding_boxes.dtype
-        device = bounding_boxes.device
 
         coefficients = _get_perspective_coeffs(endpoints, startpoints)
 
-        def perspective_bounding_boxes(bounding_boxes):
-            m1 = np.array(
-                [
-                    [coefficients[0], coefficients[1], coefficients[2]],
-                    [coefficients[3], coefficients[4], coefficients[5]],
-                ]
-            )
-            m2 = np.array(
-                [
-                    [coefficients[6], coefficients[7], 1.0],
-                    [coefficients[6], coefficients[7], 1.0],
-                ]
-            )
-
-            # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
-            input_xyxy = F.convert_bounding_box_format(
-                bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True),
-                old_format=format,
-                new_format=tv_tensors.BoundingBoxFormat.XYXY,
-                inplace=True,
-            )
-            x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist()
-
-            points = np.array(
-                [
-                    [x1, y1, 1.0],
-                    [x2, y1, 1.0],
-                    [x1, y2, 1.0],
-                    [x2, y2, 1.0],
-                ]
-            )
-
-            numerator = points @ m1.T
-            denominator = points @ m2.T
-            transformed_points = numerator / denominator
-
-            output_xyxy = torch.Tensor(
-                [
-                    float(np.min(transformed_points[:, 0])),
-                    float(np.min(transformed_points[:, 1])),
-                    float(np.max(transformed_points[:, 0])),
-                    float(np.max(transformed_points[:, 1])),
-                ]
-            )
-
-            output = F.convert_bounding_box_format(
-                output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format
-            )
-
-            # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
-            return F.clamp_bounding_boxes(
-                output,
-                format=format,
-                canvas_size=canvas_size,
-            ).to(dtype=dtype, device=device)
-
-        return tv_tensors.BoundingBoxes(
-            torch.cat([perspective_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape(
-                bounding_boxes.shape
-            ),
-            format=format,
-            canvas_size=canvas_size,
+        helper = (
+            reference_perspective_rotated_bounding_boxes
+            if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
+            else reference_perspective_bounding_boxes
         )
+        return helper(bounding_boxes, coefficients=coefficients)
 
     @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS)
-    @pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
     @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device):
diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py
@@ -1753,10 +1753,13 @@ def perspective_bounding_boxes(
     perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
 
     original_shape = bounding_boxes.shape
+    original_dtype = bounding_boxes.dtype
+    is_rotated = tv_tensors.is_rotated_bounding_format(format)
+    intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
     # TODO: first cast to float if bbox is int64 before convert_bounding_box_format
     bounding_boxes = (
-        convert_bounding_box_format(bounding_boxes, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXY)
-    ).reshape(-1, 4)
+        convert_bounding_box_format(bounding_boxes, old_format=format, new_format=intermediate_format)
+    ).reshape(-1, 8 if is_rotated else 4)
 
     dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
     device = bounding_boxes.device
@@ -1805,7 +1808,8 @@ def perspective_bounding_boxes(
     # Tensor of points has shape (N * 4, 3), where N is the number of bboxes
     # Single point structure is similar to
     # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)]
-    points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
+    points = bounding_boxes if is_rotated else bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]]
+    points = points.reshape(-1, 2)
     points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1)
     # 2) Now let's transform the points using perspective matrices
     #   x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
@@ -1817,21 +1821,23 @@ def perspective_bounding_boxes(
 
     # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
     # and compute bounding box from 4 transformed points:
-    transformed_points = transformed_points.reshape(-1, 4, 2)
-    out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
-
-    out_bboxes = clamp_bounding_boxes(
-        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype),
-        format=tv_tensors.BoundingBoxFormat.XYXY,
-        canvas_size=canvas_size,
-    )
+    if is_rotated:
+        transformed_points = transformed_points.reshape(-1, 8)
+        out_bboxes = _parallelogram_to_bounding_boxes(transformed_points)
+    else:
+        transformed_points = transformed_points.reshape(-1, 4, 2)
+        out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
+        out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1)
 
-    # out_bboxes should be of shape [N boxes, 4]
+    out_bboxes = clamp_bounding_boxes(out_bboxes, format=intermediate_format, canvas_size=canvas_size)
 
-    return convert_bounding_box_format(
-        out_bboxes, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    out_bboxes = convert_bounding_box_format(
+        out_bboxes, old_format=intermediate_format, new_format=format, inplace=True
     ).reshape(original_shape)
 
+    out_bboxes = out_bboxes.to(original_dtype)
+    return out_bboxes
+
 
 @_register_kernel_internal(perspective, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
 def _perspective_bounding_boxes_dispatch(