Add clamping_mode parameter to clamp_bounding_boxes functional and class

NicolasHug · NicolasHug · commit da7f360b125c · 2025-06-27T13:28:34.000+01:00
diff --git a/test/common_utils.py b/test/common_utils.py
@@ -410,6 +410,7 @@ def make_bounding_boxes(
     canvas_size=DEFAULT_SIZE,
     *,
     format=tv_tensors.BoundingBoxFormat.XYXY,
+    clamping_mode="soft",
     num_boxes=1,
     dtype=None,
     device="cpu",
@@ -474,7 +475,7 @@ def sample_position(values, max_value):
         # numerical issues during the testing
         buffer = 4
         out_boxes = clamp_bounding_boxes(
-            out_boxes, format=format, canvas_size=(canvas_size[0] - buffer, canvas_size[1] - buffer)
+            out_boxes, format=format, canvas_size=(canvas_size[0] - buffer, canvas_size[1] - buffer), clamping_mode=clamping_mode
         )
         if format is tv_tensors.BoundingBoxFormat.XYWHR or format is tv_tensors.BoundingBoxFormat.CXCYWHR:
             out_boxes[:, :2] += buffer // 2
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -5506,20 +5506,23 @@ def test_correctness_image(self, mean, std, dtype, fn):
 
 class TestClampBoundingBoxes:
     @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("clamping_mode", ("hard", "none"))  # TODOBB add soft
     @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_kernel(self, format, dtype, device):
-        bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device)
+    def test_kernel(self, format, clamping_mode, dtype, device):
+        bounding_boxes = make_bounding_boxes(format=format, clamping_mode=clamping_mode, dtype=dtype, device=device)
         check_kernel(
             F.clamp_bounding_boxes,
             bounding_boxes,
             format=bounding_boxes.format,
             canvas_size=bounding_boxes.canvas_size,
+            clamping_mode=clamping_mode,
         )
 
     @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
-    def test_functional(self, format):
-        check_functional(F.clamp_bounding_boxes, make_bounding_boxes(format=format))
+    @pytest.mark.parametrize("clamping_mode", ("hard", "none"))   # TODOBB add soft
+    def test_functional(self, format, clamping_mode):
+        check_functional(F.clamp_bounding_boxes, make_bounding_boxes(format=format, clamping_mode=clamping_mode))
 
     def test_errors(self):
         input_tv_tensor = make_bounding_boxes()
@@ -5540,6 +5543,47 @@ def test_errors(self):
 
     def test_transform(self):
         check_transform(transforms.ClampBoundingBoxes(), make_bounding_boxes())
+    
+    @pytest.mark.parametrize("rotated", (True, False))
+    @pytest.mark.parametrize("constructor_clamping_mode", ("hard", "none"))
+    @pytest.mark.parametrize("clamping_mode", ("hard", "none", None))  # TODOBB add soft here.
+    @pytest.mark.parametrize("pass_pure_tensor", (True, False))
+    @pytest.mark.parametrize("fn", [F.clamp_bounding_boxes, transform_cls_to_functional(transforms.ClampBoundingBoxes)])
+    def test_clamping_mode(self, rotated, constructor_clamping_mode, clamping_mode, pass_pure_tensor, fn):
+        # This test checks 2 things:
+        # - That passing clamping_mode=None to the clamp_bounding_boxes
+        #   functional (or to the class) relies on the box's `.clamping_mode`
+        #   attribute
+        # - That clamping happens when it should, and only when it should, i.e.
+        #   when the clamping mode is not "none". It doesn't validate the
+        #   numerical results, only that clamping happened. For that, we create
+        #   a large 100x100 box inside of a small 10x10 image.
+
+        if pass_pure_tensor and fn is not F.clamp_bounding_boxes:
+            # Only the functional supports pure tensors, not the class
+            return
+        if pass_pure_tensor and clamping_mode is None:
+            # cannot leave clamping_mode=None when passing pure tensor
+            return
+
+        if rotated:
+            boxes = tv_tensors.BoundingBoxes([0, 0, 100, 100, 0], format="XYWHR", canvas_size=(10, 10), clamping_mode=constructor_clamping_mode)
+            expected_clamped_output = torch.tensor([[0, 0, 10, 10, 0]])
+        else:
+            boxes = tv_tensors.BoundingBoxes([0, 100, 0, 100], format="XYXY", canvas_size=(10, 10), clamping_mode=constructor_clamping_mode)
+            expected_clamped_output = torch.tensor([[0, 10, 0, 10]])
+
+        if pass_pure_tensor:
+            out = fn(boxes.as_subclass(torch.Tensor), format=boxes.format, canvas_size=boxes.canvas_size, clamping_mode=clamping_mode)
+        else:
+            out = fn(boxes, clamping_mode=clamping_mode)
+
+        clamping_mode_prevailing = constructor_clamping_mode if clamping_mode is None else clamping_mode
+        if clamping_mode_prevailing == "none":
+            assert_equal(boxes, out)  # should be a pass-through
+        else:
+            assert_equal(out, expected_clamped_output)
+
 
 
 class TestClampKeyPoints:
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
@@ -2,6 +2,7 @@
 
 from torchvision import tv_tensors
 from torchvision.transforms.v2 import functional as F, Transform
+from torchvision.tv_tensors._bounding_boxes import CLAMPING_MODE_TYPE 
 
 
 class ConvertBoundingBoxFormat(Transform):
@@ -28,12 +29,18 @@ class ClampBoundingBoxes(Transform):
 
     The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
 
+    Args:
+        clamping_mode: TODOBB more docs. Default is None, which relies on the input box's ``.clamping_mode`` attribute.
+
     """
+    def __init__(self, clamping_mode: CLAMPING_MODE_TYPE = None) -> None:
+        super().__init__()
+        self.clamping_mode = clamping_mode
 
     _transformed_types = (tv_tensors.BoundingBoxes,)
 
     def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> tv_tensors.BoundingBoxes:
-        return F.clamp_bounding_boxes(inpt)  # type: ignore[return-value]
+        return F.clamp_bounding_boxes(inpt, clamping_mode=self.clamping_mode)  # type: ignore[return-value]
 
 
 class ClampKeyPoints(Transform):
diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
@@ -5,6 +5,7 @@
 from torchvision import tv_tensors
 from torchvision.transforms import _functional_pil as _FP
 from torchvision.tv_tensors import BoundingBoxFormat
+from torchvision.tv_tensors._bounding_boxes import CLAMPING_MODE_TYPE 
 
 from torchvision.utils import _log_api_usage_once
 
@@ -370,8 +371,11 @@ def convert_bounding_box_format(
 
 
 def _clamp_bounding_boxes(
-    bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int]
+    bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int],
+    clamping_mode: Optional[CLAMPING_MODE_TYPE],  # TODOBB shouldn't be Optional
 ) -> torch.Tensor:
+    if clamping_mode is not None and clamping_mode == "none":
+        return bounding_boxes.clone()
     # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
     #  BoundingBoxFormat instead of converting back and forth
     in_dtype = bounding_boxes.dtype
@@ -477,7 +481,8 @@ def _clamp_along_y_axis(
 
 
 def _clamp_rotated_bounding_boxes(
-    bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int]
+    bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int],
+    clamping_mode: Optional[CLAMPING_MODE_TYPE],  # TODOBB shouldn't be Optional
 ) -> torch.Tensor:
     """
     Clamp rotated bounding boxes to ensure they stay within the canvas boundaries.
@@ -499,6 +504,8 @@ def _clamp_rotated_bounding_boxes(
     Returns:
         torch.Tensor: Clamped bounding boxes in the original format and shape
     """
+    if clamping_mode is not None and clamping_mode == "none":
+        return bounding_boxes.clone()
     original_shape = bounding_boxes.shape
     dtype = bounding_boxes.dtype
     acceptable_dtypes = [torch.float64]  # Ensure consistency between CPU and GPU.
@@ -536,29 +543,33 @@ def clamp_bounding_boxes(
     inpt: torch.Tensor,
     format: Optional[BoundingBoxFormat] = None,
     canvas_size: Optional[tuple[int, int]] = None,
+    clamping_mode: Optional[CLAMPING_MODE_TYPE] = None,
 ) -> torch.Tensor:
     """See :func:`~torchvision.transforms.v2.ClampBoundingBoxes` for details."""
     if not torch.jit.is_scripting():
         _log_api_usage_once(clamp_bounding_boxes)
 
     if torch.jit.is_scripting() or is_pure_tensor(inpt):
 
-        if format is None or canvas_size is None:
-            raise ValueError("For pure tensor inputs, `format` and `canvas_size` have to be passed.")
+        # TODOBB
+        if format is None or canvas_size is None or clamping_mode is None:
+            raise ValueError("For pure tensor inputs, `format`, `canvas_size` and `clamping_mode` have to be passed.")
         if tv_tensors.is_rotated_bounding_format(format):
-            return _clamp_rotated_bounding_boxes(inpt, format=format, canvas_size=canvas_size)
+            return _clamp_rotated_bounding_boxes(inpt, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode)
         else:
-            return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size)
+            return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode)
     elif isinstance(inpt, tv_tensors.BoundingBoxes):
         if format is not None or canvas_size is not None:
             raise ValueError("For bounding box tv_tensor inputs, `format` and `canvas_size` must not be passed.")
+        if clamping_mode is None:
+            clamping_mode = inpt.clamping_mode
         if tv_tensors.is_rotated_bounding_format(inpt.format):
             output = _clamp_rotated_bounding_boxes(
-                inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size
+                inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size, clamping_mode=clamping_mode
             )
         else:
             output = _clamp_bounding_boxes(
-                inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size
+                inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size, clamping_mode=clamping_mode
             )
         return tv_tensors.wrap(output, like=inpt)
     else:
diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py
@@ -3,7 +3,7 @@
 from collections.abc import Mapping, Sequence
 
 from enum import Enum
-from typing import Any, Literal
+from typing import Any
 
 import torch
 from torch.utils._pytree import tree_flatten
@@ -46,7 +46,12 @@ def is_rotated_bounding_format(format: BoundingBoxFormat) -> bool:
     )
 
 
-CLAMPING_MODE_TYPE = Literal["hard", "soft", "none"]
+# TODOBB consider making this a Literal instead. Tried briefly and got
+# TorchScript errors, leaving it as str for now.
+# CLAMPING_MODE_TYPE = Literal["hard", "soft", "none"]
+CLAMPING_MODE_TYPE = str 
+
+# TODOBB All docs. Add any new API to rst files, add tutorial[s].
 
 
 class BoundingBoxes(TVTensor):
@@ -65,6 +70,7 @@ class BoundingBoxes(TVTensor):
         data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
         format (BoundingBoxFormat, str): Format of the bounding box.
         canvas_size (two-tuple of ints): Height and width of the corresponding image or video.
+        clamping_mode: TODOBB
         dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
             ``data``.
         device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
@@ -89,6 +95,7 @@ def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat | str, canvas_
         bounding_boxes = tensor.as_subclass(cls)
         bounding_boxes.format = format
         bounding_boxes.canvas_size = canvas_size
+        # TODOBB validate values
         bounding_boxes.clamping_mode = clamping_mode
         return bounding_boxes
 
@@ -98,13 +105,13 @@ def __new__(
         *,
         format: BoundingBoxFormat | str,
         canvas_size: tuple[int, int],
-        clamping_mode: CLAMPING_MODE_TYPE = "soft",
+        clamping_mode: CLAMPING_MODE_TYPE = "hard",  # TODOBB change default to soft!
         dtype: torch.dtype | None = None,
         device: torch.device | str | int | None = None,
         requires_grad: bool | None = None,
     ) -> BoundingBoxes:
         tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
-        return cls._wrap(tensor, format=format, canvas_size=canvas_size)
+        return cls._wrap(tensor, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode)
 
     @classmethod
     def _wrap_output(