Fix CUDA tests

AntoineSimoulin · AntoineSimoulin · commit 4050cb93402f · 2025-06-17T13:09:16.000-07:00
Test Plan:
```bash
pytest test/test_transforms_v2.py -k box -v
```
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -608,22 +608,19 @@ def affine_rotated_bounding_boxes(bounding_boxes):
         )
 
         if torch.is_floating_point(output) and int_dtype:
-            # it is better to round before cast
+            # It is important to round before cast.
             output = torch.round(output)
 
-        if clamp:
-            # It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
-            output = F.clamp_bounding_boxes(
-                output,
+        # For rotated boxes, it is important to cast before clamping.
+        return (
+            F.clamp_bounding_boxes(
+                output.to(dtype=dtype, device=device),
                 format=format,
                 canvas_size=canvas_size,
             )
-        else:
-            # We leave the bounding box as float32 so the caller gets the full precision to perform any additional
-            # operation
-            dtype = output.dtype
-
-        return output.to(dtype=dtype, device=device)
+            if clamp
+            else output.to(dtype=output.dtype, device=device)
+        )
 
     return tv_tensors.BoundingBoxes(
         torch.cat(
diff --git a/torchvision/ops/_box_convert.py b/torchvision/ops/_box_convert.py
@@ -94,6 +94,8 @@ def _box_cxcywhr_to_xywhr(boxes: Tensor) -> Tensor:
     Returns:
         boxes (Tensor(N, 5)): rotated boxes in (x1, y1, w, h, r) format.
     """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
     cx, cy, w, h, r = boxes.unbind(-1)
     r_rad = r * torch.pi / 180.0
     cos, sin = torch.cos(r_rad), torch.sin(r_rad)
@@ -102,6 +104,9 @@ def _box_cxcywhr_to_xywhr(boxes: Tensor) -> Tensor:
     y1 = cy - h / 2 * cos + w / 2 * sin
     boxes = torch.stack((x1, y1, w, h, r), dim=-1)
 
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
     return boxes
 
 
@@ -117,6 +122,8 @@ def _box_xywhr_to_cxcywhr(boxes: Tensor) -> Tensor:
     Returns:
         boxes (Tensor[N, 5]): rotated boxes in (cx, cy, w, h, r) format.
     """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
     x1, y1, w, h, r = boxes.unbind(-1)
     r_rad = r * torch.pi / 180.0
     cos, sin = torch.cos(r_rad), torch.sin(r_rad)
@@ -125,6 +132,9 @@ def _box_xywhr_to_cxcywhr(boxes: Tensor) -> Tensor:
     cy = y1 - w / 2 * sin + h / 2 * cos
 
     boxes = torch.stack([cx, cy, w, h, r], dim=-1)
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
     return boxes
 
 
@@ -145,6 +155,8 @@ def _box_xywhr_to_xyxyxyxy(boxes: Tensor) -> Tensor:
     Returns:
         boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x2, y2, x3, y3, x4, y4) format.
     """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
     x1, y1, w, h, r = boxes.unbind(-1)
     r_rad = r * torch.pi / 180.0
     cos, sin = torch.cos(r_rad), torch.sin(r_rad)
@@ -156,7 +168,11 @@ def _box_xywhr_to_xyxyxyxy(boxes: Tensor) -> Tensor:
     x4 = x1 + h * sin
     y4 = y1 + h * cos
 
-    return torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=-1)
+    boxes = torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=-1)
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
+    return boxes
 
 
 def _box_xyxyxyxy_to_xywhr(boxes: Tensor) -> Tensor:
@@ -175,6 +191,8 @@ def _box_xyxyxyxy_to_xywhr(boxes: Tensor) -> Tensor:
     Returns:
         boxes (Tensor[N, 5]): rotated boxes in (x1, y1, w, h, r) format.
     """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
     x1, y1, x2, y2, x3, y3, x4, y4 = boxes.unbind(-1)
     r_rad = torch.atan2(y1 - y2, x2 - x1)
     r = r_rad * 180 / torch.pi
@@ -183,5 +201,7 @@ def _box_xyxyxyxy_to_xywhr(boxes: Tensor) -> Tensor:
     h = ((x3 - x2) ** 2 + (y3 - y2) ** 2).sqrt()
 
     boxes = torch.stack((x1, y1, w, h, r), dim=-1)
-
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
     return boxes
diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py
@@ -924,9 +924,9 @@ def _affine_bounding_boxes_with_expand(
         return bounding_boxes, canvas_size
 
     original_shape = bounding_boxes.shape
-    original_dtype = bounding_boxes.dtype
-    bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
     dtype = bounding_boxes.dtype
+    need_cast = not bounding_boxes.is_floating_point()
+    bounding_boxes = bounding_boxes.float() if need_cast else bounding_boxes.clone()
     device = bounding_boxes.device
     is_rotated = tv_tensors.is_rotated_bounding_format(format)
     intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
@@ -947,7 +947,7 @@ def _affine_bounding_boxes_with_expand(
     transposed_affine_matrix = (
         torch.tensor(
             affine_vector,
-            dtype=dtype,
+            dtype=bounding_boxes.dtype,
             device=device,
         )
         .reshape(2, 3)
@@ -961,7 +961,7 @@ def _affine_bounding_boxes_with_expand(
         points = bounding_boxes.reshape(-1, 2)
     else:
         points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
-    points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=dtype)], dim=-1)
+    points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=bounding_boxes.dtype)], dim=-1)
     # 2) Now let's transform the points using affine matrix
     transformed_points = torch.matmul(points, transposed_affine_matrix)
     # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
@@ -985,7 +985,7 @@ def _affine_bounding_boxes_with_expand(
                 [float(width), float(height), 1.0],
                 [float(width), 0.0, 1.0],
             ],
-            dtype=dtype,
+            dtype=bounding_boxes.dtype,
             device=device,
         )
         new_points = torch.matmul(points, transposed_affine_matrix)
@@ -1002,7 +1002,10 @@ def _affine_bounding_boxes_with_expand(
         out_bboxes, old_format=intermediate_format, new_format=format, inplace=True
     ).reshape(original_shape)
 
-    out_bboxes = out_bboxes.to(original_dtype)
+    if need_cast:
+        if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
+            out_bboxes.round_()
+        out_bboxes = out_bboxes.to(dtype)
     return out_bboxes, canvas_size
 
 
diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
@@ -181,7 +181,8 @@ def _cxcywhr_to_xywhr(cxcywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
         cxcywhr = cxcywhr.clone()
 
     dtype = cxcywhr.dtype
-    if not cxcywhr.is_floating_point():
+    need_cast = not cxcywhr.is_floating_point()
+    if need_cast:
         cxcywhr = cxcywhr.float()
 
     half_wh = cxcywhr[..., 2:-1].div(-2, rounding_mode=None if cxcywhr.is_floating_point() else "floor").abs_()
@@ -192,15 +193,20 @@ def _cxcywhr_to_xywhr(cxcywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
     # (cy + width / 2 * sin - height / 2 * cos) = y1
     cxcywhr[..., 1].add_(half_wh[..., 0].mul(sin)).sub_(half_wh[..., 1].mul(cos))
 
-    return cxcywhr.to(dtype)
+    if need_cast:
+        cxcywhr.round_()
+        cxcywhr = cxcywhr.to(dtype)
+
+    return cxcywhr
 
 
 def _xywhr_to_cxcywhr(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
     if not inplace:
         xywhr = xywhr.clone()
 
     dtype = xywhr.dtype
-    if not xywhr.is_floating_point():
+    need_cast = not xywhr.is_floating_point()
+    if need_cast:
         xywhr = xywhr.float()
 
     half_wh = xywhr[..., 2:-1].div(-2, rounding_mode=None if xywhr.is_floating_point() else "floor").abs_()
@@ -211,7 +217,11 @@ def _xywhr_to_cxcywhr(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
     # (y1 - width / 2 * sin + height / 2 * cos) = cy
     xywhr[..., 1].sub_(half_wh[..., 0].mul(sin)).add_(half_wh[..., 1].mul(cos))
 
-    return xywhr.to(dtype)
+    if need_cast:
+        xywhr.round_()
+        xywhr = xywhr.to(dtype)
+
+    return xywhr
 
 
 def _xywhr_to_xyxyxyxy(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
@@ -220,7 +230,8 @@ def _xywhr_to_xyxyxyxy(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
         xywhr = xywhr.clone()
 
     dtype = xywhr.dtype
-    if not xywhr.is_floating_point():
+    need_cast = not xywhr.is_floating_point()
+    if need_cast:
         xywhr = xywhr.float()
 
     wh = xywhr[..., 2:-1]
@@ -239,7 +250,12 @@ def _xywhr_to_xyxyxyxy(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
     xywhr[..., 6].add_(wh[..., 1].mul(sin))
     # y1 + h * cos = y4
     xywhr[..., 7].add_(wh[..., 1].mul(cos))
-    return xywhr.to(dtype)
+
+    if need_cast:
+        xywhr.round_()
+        xywhr = xywhr.to(dtype)
+
+    return xywhr
 
 
 def _xyxyxyxy_to_xywhr(xyxyxyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
@@ -248,7 +264,8 @@ def _xyxyxyxy_to_xywhr(xyxyxyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
         xyxyxyxy = xyxyxyxy.clone()
 
     dtype = xyxyxyxy.dtype
-    if not xyxyxyxy.is_floating_point():
+    need_cast = not xyxyxyxy.is_floating_point()
+    if need_cast:
         xyxyxyxy = xyxyxyxy.float()
 
     r_rad = torch.atan2(xyxyxyxy[..., 1].sub(xyxyxyxy[..., 3]), xyxyxyxy[..., 2].sub(xyxyxyxy[..., 0]))
@@ -260,7 +277,12 @@ def _xyxyxyxy_to_xywhr(xyxyxyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
     # sqrt((x2 - x3) ** 2 + (y2 - y3) ** 2) = h
     xyxyxyxy[..., 3] = xyxyxyxy[..., 4].pow(2).add(xyxyxyxy[..., 5].pow(2)).sqrt()
     xyxyxyxy[..., 4] = r_rad.div_(torch.pi).mul_(180.0)
-    return xyxyxyxy[..., :5].to(dtype)
+
+    if need_cast:
+        xyxyxyxy.round_()
+        xyxyxyxy = xyxyxyxy.to(dtype)
+
+    return xyxyxyxy[..., :5]
 
 
 def _convert_bounding_box_format(
@@ -423,14 +445,14 @@ def _clamp_along_y_axis(
     case_d = torch.zeros_like(case_c)
     case_e = torch.cat([x.unsqueeze(1) for x in [x1.clamp(0), y1, x2.clamp(0), y2, x3, y3, x4, y4]], dim=1)
 
-    cond_a = x1.lt(0).logical_and(x2.ge(0)).logical_and(x3.ge(0)).logical_and(x4.ge(0))
+    cond_a = (x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 >= 0)
     cond_a = cond_a.logical_and(_area(case_a) > _area(case_b))
-    cond_a = cond_a.logical_or(x1.lt(0).logical_and(x2.ge(0)).logical_and(x3.ge(0)).logical_and(x4.le(0)))
-    cond_b = x1.lt(0).logical_and(x2.ge(0)).logical_and(x3.ge(0)).logical_and(x4.ge(0))
+    cond_a = cond_a.logical_or((x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 <= 0))
+    cond_b = (x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 >= 0)
     cond_b = cond_b.logical_and(_area(case_a) <= _area(case_b))
-    cond_b = cond_b.logical_or(x1.lt(0).logical_and(x2.le(0)).logical_and(x3.ge(0)).logical_and(x4.ge(0)))
-    cond_c = x1.lt(0).logical_and(x2.le(0)).logical_and(x3.ge(0)).logical_and(x4.le(0))
-    cond_d = x1.lt(0).logical_and(x2.le(0)).logical_and(x3.le(0)).logical_and(x4.le(0))
+    cond_b = cond_b.logical_or((x1 < 0).logical_and(x2 <= 0).logical_and(x3 >= 0).logical_and(x4 >= 0))
+    cond_c = (x1 < 0).logical_and(x2 <= 0).logical_and(x3 >= 0).logical_and(x4 <= 0)
+    cond_d = (x1 < 0).logical_and(x2 <= 0).logical_and(x3 <= 0).logical_and(x4 <= 0)
     cond_e = x1.isclose(x2)
 
     for cond, case in zip(
@@ -465,15 +487,17 @@ def _clamp_rotated_bounding_boxes(
         torch.Tensor: Clamped bounding boxes in the original format and shape
     """
     original_shape = bounding_boxes.shape
-    original_dtype = bounding_boxes.dtype
-    bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
+    dtype = bounding_boxes.dtype
+    acceptable_dtypes = [torch.float64]  # Ensure consistency between CPU and GPU.
+    need_cast = dtype not in acceptable_dtypes
+    bounding_boxes = bounding_boxes.to(torch.float64) if need_cast else bounding_boxes.clone()
     out_boxes = (
         convert_bounding_box_format(
             bounding_boxes, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, inplace=True
         )
     ).reshape(-1, 8)
 
-    for _ in range(4):
+    for _ in range(4):  # Iterate over the 4 vertices.
         indices, out_boxes = _order_bounding_boxes_points(out_boxes)
         out_boxes = _clamp_along_y_axis(out_boxes)
         _, out_boxes = _order_bounding_boxes_points(out_boxes, indices)
@@ -488,7 +512,10 @@ def _clamp_rotated_bounding_boxes(
         out_boxes, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format, inplace=True
     ).reshape(original_shape)
 
-    out_boxes = out_boxes.to(original_dtype)
+    if need_cast:
+        if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
+            out_boxes.round_()
+        out_boxes = out_boxes.to(dtype)
     return out_boxes