Fix PR comments for rotated box transforms

AntoineSimoulin · AntoineSimoulin · commit 36b02dd94641 · 2025-05-29T12:48:12.000-07:00
diff --git a/test/assets/fakedata/draw_rotated_boxes.png b/test/assets/fakedata/draw_rotated_boxes.png
diff --git a/test/common_utils.py b/test/common_utils.py
@@ -444,13 +444,13 @@ def sample_position(values, max_value):
         r_rad = r * torch.pi / 180.0
         cos, sin = torch.cos(r_rad), torch.sin(r_rad)
         x1, y1 = x, y
-        x3 = x1 + w * cos
-        y3 = y1 - w * sin
-        x2 = x3 + h * sin
-        y2 = y3 + h * cos
+        x2 = x1 + w * cos
+        y2 = y1 - w * sin
+        x3 = x2 + h * sin
+        y3 = y2 + h * cos
         x4 = x1 + h * sin
         y4 = y1 + h * cos
-        parts = (x1, y1, x3, y3, x2, y2, x4, y4)
+        parts = (x1, y1, x2, y2, x3, y3, x4, y4)
     else:
         raise ValueError(f"Format {format} is not supported")
 
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -575,13 +575,13 @@ def affine_rotated_bounding_boxes(bounding_boxes):
             new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
             inplace=True,
         )
-        x1, y1, x3, y3, x2, y2, x4, y4 = input_xyxyxyxy.squeeze(0).tolist()
+        x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist()
 
         points = np.array(
             [
                 [x1, y1, 1.0],
-                [x3, y3, 1.0],
                 [x2, y2, 1.0],
+                [x3, y3, 1.0],
                 [x4, y4, 1.0],
             ]
         )
@@ -604,14 +604,14 @@ def affine_rotated_bounding_boxes(bounding_boxes):
         )
 
         if clamp:
-            # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
+            # It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
             output = F.clamp_bounding_boxes(
                 output,
                 format=format,
                 canvas_size=canvas_size,
             )
         else:
-            # We leave the bounding box as float64 so the caller gets the full precision to perform any additional
+            # We leave the bounding box as float32 so the caller gets the full precision to perform any additional
             # operation
             dtype = output.dtype
 
@@ -1143,17 +1143,20 @@ def test_image_correctness(self, fn):
 
         torch.testing.assert_close(actual, expected)
 
-    def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes, format):
+    def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
         affine_matrix = np.array(
             [
                 [-1, 0, bounding_boxes.canvas_size[1]],
                 [0, 1, 0],
             ],
         )
 
-        if tv_tensors.is_rotated_bounding_format(format):
-            return reference_affine_rotated_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix)
-        return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix)
+        helper = (
+            reference_affine_rotated_bounding_boxes_helper
+            if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
+            else reference_affine_bounding_boxes_helper
+        )
+        return helper(bounding_boxes, affine_matrix=affine_matrix)
 
     @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
     @pytest.mark.parametrize(
@@ -1163,7 +1166,7 @@ def test_bounding_boxes_correctness(self, format, fn):
         bounding_boxes = make_bounding_boxes(format=format)
 
         actual = fn(bounding_boxes)
-        expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes, format)
+        expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes)
 
         torch.testing.assert_close(actual, expected)
 
@@ -1595,25 +1598,28 @@ def test_image_correctness(self, fn):
 
         torch.testing.assert_close(actual, expected)
 
-    def _reference_vertical_flip_bounding_boxes(self, bounding_boxes, format):
+    def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
         affine_matrix = np.array(
             [
                 [1, 0, 0],
                 [0, -1, bounding_boxes.canvas_size[0]],
             ],
         )
 
-        if tv_tensors.is_rotated_bounding_format(format):
-            return reference_affine_rotated_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix)
-        return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix)
+        helper = (
+            reference_affine_rotated_bounding_boxes_helper
+            if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
+            else reference_affine_bounding_boxes_helper
+        )
+        return helper(bounding_boxes, affine_matrix=affine_matrix)
 
     @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
     @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)])
     def test_bounding_boxes_correctness(self, format, fn):
         bounding_boxes = make_bounding_boxes(format=format)
 
         actual = fn(bounding_boxes)
-        expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes, format)
+        expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes)
 
         torch.testing.assert_close(actual, expected)
 
diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py
@@ -44,69 +44,31 @@ def test_bbox_instance(data, format):
 
 
 @pytest.mark.parametrize(
-    "format",
+    "format, is_rotated_expected",
     [
-        "XYXY",
-        "XYWH",
-        "CXCYWH",
-        "XYXYXYXY",
-        "XYWHR",
-        "CXCYWHR",
-        tv_tensors.BoundingBoxFormat.XYXY,
-        tv_tensors.BoundingBoxFormat.XYWH,
-        tv_tensors.BoundingBoxFormat.CXCYWH,
-        tv_tensors.BoundingBoxFormat.XYXYXYXY,
-        tv_tensors.BoundingBoxFormat.XYWHR,
-        tv_tensors.BoundingBoxFormat.CXCYWHR,
+        ("XYXY", False),
+        ("XYWH", False),
+        ("CXCYWH", False),
+        ("XYXYXYXY", True),
+        ("XYWHR", True),
+        ("CXCYWHR", True),
+        (tv_tensors.BoundingBoxFormat.XYXY, False),
+        (tv_tensors.BoundingBoxFormat.XYWH, False),
+        (tv_tensors.BoundingBoxFormat.CXCYWH, False),
+        (tv_tensors.BoundingBoxFormat.XYXYXYXY, True),
+        (tv_tensors.BoundingBoxFormat.XYWHR, True),
+        (tv_tensors.BoundingBoxFormat.CXCYWHR, True),
     ],
 )
-def test_bbox_format(format):
+@pytest.mark.parametrize("scripted", (False, True))
+def test_bbox_format(format, is_rotated_expected, scripted):
     if isinstance(format, str):
         format = tv_tensors.BoundingBoxFormat[(format.upper())]
-    if format == tv_tensors.BoundingBoxFormat.XYXYXYXY:
-        assert tv_tensors.is_rotated_bounding_format(format) is True
-    elif format == tv_tensors.BoundingBoxFormat.XYWHR:
-        assert tv_tensors.is_rotated_bounding_format(format) is True
-    elif format == tv_tensors.BoundingBoxFormat.CXCYWHR:
-        assert tv_tensors.is_rotated_bounding_format(format) is True
-    else:
-        assert tv_tensors.is_rotated_bounding_format(format) is False
 
-
-@pytest.mark.parametrize(
-    "format",
-    [
-        "XYXY",
-        "XYWH",
-        "CXCYWH",
-        "XYXYXYXY",
-        "XYWHR",
-        "CXCYWHR",
-        tv_tensors.BoundingBoxFormat.XYXY,
-        tv_tensors.BoundingBoxFormat.XYWH,
-        tv_tensors.BoundingBoxFormat.CXCYWH,
-        tv_tensors.BoundingBoxFormat.XYXYXYXY,
-        tv_tensors.BoundingBoxFormat.XYWHR,
-        tv_tensors.BoundingBoxFormat.CXCYWHR,
-    ],
-)
-def test_bbox_format_scripted(format):
-    obj = tv_tensors.is_rotated_bounding_format
-    try:
-        fn = torch.jit.script(obj)
-    except Exception as error:
-        name = getattr(obj, "__name__", obj.__class__.__name__)
-        raise AssertionError(f"Trying to `torch.jit.script` `{name}` raised the error above.") from error
-    if isinstance(format, str):
-        format = tv_tensors.BoundingBoxFormat[(format.upper())]
-    if format == tv_tensors.BoundingBoxFormat.XYXYXYXY:
-        assert fn(format) is True
-    elif format == tv_tensors.BoundingBoxFormat.XYWHR:
-        assert fn(format) is True
-    elif format == tv_tensors.BoundingBoxFormat.CXCYWHR:
-        assert fn(format) is True
-    else:
-        assert fn(format) is False
+    fn = tv_tensors.is_rotated_bounding_format
+    if scripted:
+        fn = torch.jit.script(fn)
+    assert fn(format) == is_rotated_expected
 
 
 def test_bbox_dim_error():
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -17,7 +17,14 @@
 PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split("."))
 
 boxes = torch.tensor([[0, 0, 20, 20], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float)
-
+rotated_boxes = torch.tensor(
+    [
+        [100, 150, 150, 150, 150, 250, 100, 250],
+        [200, 350, 250, 350, 250, 250, 200, 250],
+        [300, 200, 200, 200, 200, 250, 300, 250],
+    ],
+    dtype=torch.float,
+)
 keypoints = torch.tensor([[[10, 10], [5, 5], [2, 2]], [[20, 20], [30, 30], [3, 3]]], dtype=torch.float)
 
 
@@ -148,6 +155,18 @@ def test_draw_boxes_with_coloured_label_backgrounds():
     assert_equal(result, expected)
 
 
+@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1")
+def test_draw_rotatated_boxes():
+    img = torch.full((3, 500, 500), 255, dtype=torch.uint8)
+    colors = ["blue", "yellow", (0, 255, 0)]
+
+    result = utils.draw_bounding_boxes(img, rotated_boxes, colors=colors)
+    expected = torch.as_tensor(np.array(result)).permute(2, 0, 1)
+    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_rotated_boxes.png")
+    expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1)
+    assert_equal(result, expected)
+
+
 @pytest.mark.parametrize("fill", [True, False])
 def test_draw_boxes_dtypes(fill):
     img_uint8 = torch.full((3, 100, 100), 255, dtype=torch.uint8)
diff --git a/torchvision/ops/_box_convert.py b/torchvision/ops/_box_convert.py
@@ -130,56 +130,56 @@ def _box_xywhr_to_cxcywhr(boxes: Tensor) -> Tensor:
 
 def _box_xywhr_to_xyxyxyxy(boxes: Tensor) -> Tensor:
     """
-    Converts rotated bounding boxes from (x1, y1, w, h, r) format to (x1, y1, x3, y3, x2, y2, x4, y4) format.
+    Converts rotated bounding boxes from (x1, y1, w, h, r) format to (x1, y1, x2, y2, x3, y3, x4, y4) format.
     (x1, y1) refer to top left of bounding box
     (w, h) are width and height of the rotated bounding box
     r is rotation angle w.r.t to the box center by :math:`|r|` degrees counter clock wise in the image plan
 
     (x1, y1) refer to top left of rotated bounding box
-    (x3, y3) refer to top right of rotated bounding box
-    (x2, y2) refer to bottom right of rotated bounding box
+    (x2, y2) refer to top right of rotated bounding box
+    (x3, y3) refer to bottom right of rotated bounding box
     (x4, y4) refer to bottom left ofrotated bounding box
     Args:
         boxes (Tensor[N, 5]): rotated boxes in (cx, cy, w, h, r) format which will be converted.
 
     Returns:
-        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x3, y3, x2, y2, x4, y4) format.
+        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x2, y2, x3, y3, x4, y4) format.
     """
     x1, y1, w, h, r = boxes.unbind(-1)
     r_rad = r * torch.pi / 180.0
     cos, sin = torch.cos(r_rad), torch.sin(r_rad)
 
-    x3 = x1 + w * cos
-    y3 = y1 - w * sin
-    x2 = x3 + h * sin
-    y2 = y3 + h * cos
+    x2 = x1 + w * cos
+    y2 = y1 - w * sin
+    x3 = x2 + h * sin
+    y3 = y2 + h * cos
     x4 = x1 + h * sin
     y4 = y1 + h * cos
 
-    return torch.stack((x1, y1, x3, y3, x2, y2, x4, y4), dim=-1)
+    return torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=-1)
 
 
 def _box_xyxyxyxy_to_xywhr(boxes: Tensor) -> Tensor:
     """
-    Converts rotated bounding boxes from (x1, y1, x3, y3, x2, y2, x4, y4) format to (x1, y1, w, h, r) format.
+    Converts rotated bounding boxes from (x1, y1, x2, y2, x3, y3, x4, y4) format to (x1, y1, w, h, r) format.
     (x1, y1) refer to top left of the rotated bounding box
-    (x3, y3) refer to bottom left of the rotated bounding box
-    (x2, y2) refer to bottom right of the rotated bounding box
+    (x2, y2) refer to top right of the rotated bounding box
+    (x3, y3) refer to bottom right of the rotated bounding box
-    (x4, y4) refer to top right of the rotated bounding box
+    (x4, y4) refer to bottom left of the rotated bounding box
     (w, h) refers to width and height of rotated bounding box
     r is rotation angle w.r.t to the box center by :math:`|r|` degrees counter clock wise in the image plan
 
     Args:
-        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x3, y3, x2, y2, x4, y4) format.
+        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x2, y2, x3, y3, x4, y4) format.
 
     Returns:
         boxes (Tensor[N, 5]): rotated boxes in (x1, y1, w, h, r) format.
     """
-    x1, y1, x3, y3, x2, y2, x4, y4 = boxes.unbind(-1)
-    r_rad = torch.atan2(y1 - y3, x3 - x1)
+    x1, y1, x2, y2, x3, y3, x4, y4 = boxes.unbind(-1)
+    r_rad = torch.atan2(y1 - y2, x2 - x1)
     r = r_rad * 180 / torch.pi
 
-    w = ((x3 - x1) ** 2 + (y1 - y3) ** 2).sqrt()
+    w = ((x2 - x1) ** 2 + (y1 - y2) ** 2).sqrt()
     h = ((x3 - x2) ** 2 + (y3 - y2) ** 2).sqrt()
 
     boxes = torch.stack((x1, y1, w, h, r), dim=-1)
diff --git a/torchvision/ops/boxes.py b/torchvision/ops/boxes.py
@@ -209,8 +209,8 @@ def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
     being width and height.
     r is rotation angle w.r.t to the box center by :math:`|r|` degrees counter clock wise in the image plan
 
-    ``'xyxyxyxy'``: boxes are represented via corners, x1, y1 being top left, x2, y2 bottom right,
-    x3, y3 bottom left, and x4, y4 top right.
+    ``'xyxyxyxy'``: boxes are represented via corners, x1, y1 being top left, x2, y2 top right,
+    x3, y3 bottom right, and x4, y4 bottom left.
 
     Args:
         boxes (Tensor[N, K]): boxes which will be converted. K is the number of coordinates (4 for unrotated bounding boxes, 5 or 8 for rotated bounding boxes)
diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py
@@ -94,7 +94,7 @@ def horizontal_flip_bounding_boxes(
         dtype = bounding_boxes.dtype
         if not torch.is_floating_point(bounding_boxes):
             # Casting to float to support cos and sin computations.
-            bounding_boxes = bounding_boxes.to(torch.float64)
+            bounding_boxes = bounding_boxes.to(torch.float32)
         angle_rad = bounding_boxes[:, 4].mul(torch.pi).div(180)
         bounding_boxes[:, 0].add_(bounding_boxes[:, 2].mul(angle_rad.cos())).sub_(canvas_size[1]).neg_()
         bounding_boxes[:, 1].sub_(bounding_boxes[:, 2].mul(angle_rad.sin()))
diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
@@ -227,13 +227,13 @@ def _xywhr_to_xyxyxyxy(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
     r_rad = xywhr[..., 4].mul(torch.pi).div(180.0)
     cos, sin = r_rad.cos(), r_rad.sin()
     xywhr = xywhr[..., :2].tile((1, 4))
-    # x1 + w * cos = x3
+    # x1 + w * cos = x2
     xywhr[..., 2].add_(wh[..., 0].mul(cos))
-    # y1 - w * sin = y3
+    # y1 - w * sin = y2
     xywhr[..., 3].sub_(wh[..., 0].mul(sin))
-    # x1 + w * cos + h * sin = x2
+    # x1 + w * cos + h * sin = x3
     xywhr[..., 4].add_(wh[..., 0].mul(cos).add(wh[..., 1].mul(sin)))
-    # y1 - w * sin + h * cos = y2
+    # y1 - w * sin + h * cos = y3
     xywhr[..., 5].sub_(wh[..., 0].mul(sin).sub(wh[..., 1].mul(cos)))
     # x1 + h * sin = x4
     xywhr[..., 6].add_(wh[..., 1].mul(sin))
@@ -252,12 +252,12 @@ def _xyxyxyxy_to_xywhr(xyxyxyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
         xyxyxyxy = xyxyxyxy.float()
 
     r_rad = torch.atan2(xyxyxyxy[..., 1].sub(xyxyxyxy[..., 3]), xyxyxyxy[..., 2].sub(xyxyxyxy[..., 0]))
-    # x1, y1, (x3 - x1), (y3 - y1), (x2 - x3), (y2 - y3) x4, y4
+    # x1, y1, (x2 - x1), (y2 - y1), (x3 - x2), (y3 - y2) x4, y4
     xyxyxyxy[..., 4:6].sub_(xyxyxyxy[..., 2:4])
     xyxyxyxy[..., 2:4].sub_(xyxyxyxy[..., :2])
-    # sqrt((x3 - x1) ** 2 + (y1 - y3) ** 2) = w
+    # sqrt((x2 - x1) ** 2 + (y1 - y2) ** 2) = w
     xyxyxyxy[..., 2] = xyxyxyxy[..., 2].pow(2).add(xyxyxyxy[..., 3].pow(2)).sqrt()
-    # sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) = h
+    # sqrt((x2 - x3) ** 2 + (y2 - y3) ** 2) = h
     xyxyxyxy[..., 3] = xyxyxyxy[..., 4].pow(2).add(xyxyxyxy[..., 5].pow(2)).sqrt()
     xyxyxyxy[..., 4] = r_rad.div_(torch.pi).mul_(180.0)
     return xyxyxyxy[..., :5].to(dtype)
diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py
@@ -26,8 +26,8 @@ class BoundingBoxFormat(Enum):
       cy being center of box, w, h being width and height. r is rotation angle
       in degrees.
     * ``XYXYXYXY``: rotated boxes represented via corners, x1, y1 being top
-      left, x2, y2 being bottom right, x3, y3 being bottom left, x4, y4 being
-      top right.
+      left, x2, y2 being top right, x3, y3 being bottom right, x4, y4 being
+      bottom left.
     """
 
     XYXY = "XYXY"
diff --git a/torchvision/utils.py b/torchvision/utils.py