Add more rotated boxes docs (pytorch#9144)

NicolasHug · AntoineSimoulin · commit 69d12edb8e09 · 2025-07-09T14:03:57.000-07:00
diff --git a/gallery/transforms/plot_rotated_box_transforms.py b/gallery/transforms/plot_rotated_box_transforms.py
@@ -3,10 +3,13 @@
 Transforms on Rotated Bounding Boxes
 ===============================================================
 
-This example illustrates how to define and use rotated bounding boxes. We'll
-cover how to define them, demonstrate their usage with some of the existing
-transforms, and finally some of their unique behavior in comparision to
-standard bounding boxes.
+This example illustrates how to define and use rotated bounding boxes.
+
+.. note::
+    Support for rotated bounding boxes was released in TorchVision 0.23 and is
+    currently a BETA feature. We don't expect the API to change, but there may
+    be some rare edge-cases. If you find any issues, please report them on
+    our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
 
 First, a bit of setup code:
 """
@@ -18,7 +21,7 @@
 
 
 import torch
-from torchvision import tv_tensors
+from torchvision.tv_tensors import BoundingBoxes
 from torchvision.transforms import v2
 from helpers import plot
 
@@ -37,16 +40,16 @@
 # Creating a Rotated Bounding Box
 # -------------------------------
 # Rotated bounding boxes are created by instantiating the
-# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the `format`
+# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the ``format``
 # parameter of the constructor that determines if a bounding box is rotated or
-# not. In this instance, we use the
-# :attr:`~torchvision.tv_tensors.BoundingBoxFormat` kind `CXCYWHR`. The first
-# two values are the `x` and `y` coordinates of the center of the bounding box.
-# The next two values are the `width` and `height` of the bounding box, and the
-# last value is the `rotation` of the bounding box.
+# not. In this instance, we use the CXCYWHR
+# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are
+# the X and Y coordinates of the center of the bounding box.  The next two
+# values are the width and height of the bounding box, and the last value is the
+# rotation of the bounding box, in degrees.
 
 
-orig_box = tv_tensors.BoundingBoxes(
+orig_box = BoundingBoxes(
     [
         [860.0, 1100, 570, 1840, -7],
     ],
@@ -57,100 +60,136 @@
 plot([(orig_img, orig_box)], bbox_width=10)
 
 # %%
-# Rotation
-# --------
-# Rotated bounding boxes maintain their rotation with respect to the image even
-# when the image itself is rotated through the
-# :class:`~torchvision.transforms.RandomRotation` transform.
+# Transforms illustrations
+# ------------------------
+#
+# Using :class:`~torchvision.transforms.RandomRotation`:
 rotater = v2.RandomRotation(degrees=(0, 180), expand=True)
 rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)]
 plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10)
 
 # %%
-# Padding
-# -------
-# Rotated bounding boxes also maintain their properties when the image is padded using
-# :class:`~torchvision.transforms.Pad`.
+# Using :class:`~torchvision.transforms.Pad`:
 padded_imgs_and_boxes = [
     v2.Pad(padding=padding)(orig_img, orig_box)
     for padding in (30, 50, 100, 200)
 ]
 plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10)
 
 # %%
-# Resizing
-# --------
-# Rotated bounding boxes are also resized along with an image in the
-# :class:`~torchvision.transforms.Resize` transform.
-#
-# Note that the bounding box looking bigger in the images with less pixels is
-# an artifact, not reality. That is merely the rasterised representation of the
-# bounding box's boundaries appearing bigger because we specify a fixed width of
-# that rasterized line. When the image is, say, only 30 pixels wide, a
-# line that is 3 pixels wide is relatively large.
+# Using :class:`~torchvision.transforms.Resize`:
 resized_imgs = [
     v2.Resize(size=size)(orig_img, orig_box)
     for size in (30, 50, 100, orig_img.size)
 ]
 plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5)
 
 # %%
-# Perspective
-# -----------
-# The rotated bounding box is also transformed along with the image when the
-# perspective is transformed with :class:`~torchvision.transforms.RandomPerspective`.
-perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0)
-perspective_imgs = [perspective_transformer(orig_img, orig_box) for _ in range(4)]
-plot([(orig_img, orig_box)] + perspective_imgs, bbox_width=10)
-
-# %%
-# Elastic Transform
-# -----------------
-# The rotated bounding box is appropriately unchanged when going through the
-# :class:`~torchvision.transforms.ElasticTransform`.
-elastic_imgs = [
-    v2.ElasticTransform(alpha=alpha)(orig_img, orig_box)
-    for alpha in (100.0, 500.0, 1000.0, 2000.0)
-]
-plot([(orig_img, orig_box)] + elastic_imgs, bbox_width=10)
-
-# %%
-# Crop & Clamping Modes
-# ---------------------
-# The :class:`~torchvision.transforms.CenterCrop` transform selectively crops
-# the image on a center location. The behavior of the rotated bounding box
-# depends on its `clamping_mode`. We can set the `clamping_mode` in the
-# :class:`~torchvision.tv_tensors.BoundingBoxes` constructur, or by directly
-# setting it after construction as we do in the example below.
+# Note that the bounding box looking bigger in the images with fewer pixels is
+# an artifact, not reality. That is merely the rasterized representation of the
+# bounding box's boundaries appearing bigger because we specify a fixed width of
+# that rasterized line. When the image is, say, only 30 pixels wide, a
+# line that is 3 pixels wide is relatively large.
 #
-# There are two values for `clamping_mode`:
+# .. _clamping_mode_tuto:
 #
-#  - `"soft"`: The default when constucting
-#    :class:`~torchvision.tv_tensors.BoundingBoxes`. <Insert semantic
-#    description for soft mode.>
-#  - `"hard"`: <Insert semantic description for hard mode.>
+# Clamping Mode, and its effect on transforms
+# -------------------------------------------
 #
-# For standard bounding boxes, both modes behave the same. We also need to
-# document:
+# Some transforms, such as :class:`~torchvision.transforms.CenterCrop`, may
+# result in having the transformed bounding box partially outside of the
+# transformed (cropped) image. In general, this may happen on most of the
+# :ref:`geometric transforms <v2_api_ref>`.
 #
-#  - `clamping_mode` for individual kernels.
-#  - `clamping_mode` in :class:`~torchvision.transforms.v2.ClampBoundingBoxes`.
-#  - the new :class:`~torchvision.transforms.v2.SetClampingMode` transform.
+# In such cases, the bounding box is clamped to the transformed image size based
+# on its ``clamping_mode`` attribute.  There are three values for
+# ``clamping_mode``, which determine how the box is clamped after a
+# transformation:
 #
+#  - ``None``: No clamping is applied, and the bounding box may be partially
+#    outside of the image.
+#  - `"hard"`:  The box is clamped to the image size, such that all its corners
+#    are within the image canvas. This potentially results in a loss of
+#    information, and it can lead to unintuitive results. But it may be
+#    necessary for some applications e.g. if the model doesn't support boxes
+#    outside of their image.
+#  - `"soft"`: This is an intermediate mode between ``None`` and "hard": the
+#    box is clamped, but not as strictly as in "hard" mode. Some box dimensions
+#    may still be outside of the image. This is the default when constructing
+#    :class:`~torchvision.tv_tensors.BoundingBoxes`.
+#
+# .. note::
+#
+#       For axis-aligned bounding boxes, the `"soft"` and `"hard"` modes behave
+#       the same, as the bounding box is always clamped to the image size.
+#
+# Let's illustrate the clamping modes with the
+# :class:`~torchvision.transforms.CenterCrop` transform:
+
 assert orig_box.clamping_mode == "soft"
-hard_box = orig_box.clone()
-hard_box.clamping_mode = "hard"
 
+box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard")
+
+box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None)
+
+crop_sizes = (800, 1200, 2000, orig_img.size)
 soft_center_crops_and_boxes = [
     v2.CenterCrop(size=size)(orig_img, orig_box)
-    for size in (800, 1200, 2000, orig_img.size)
+    for size in crop_sizes
 ]
 
 hard_center_crops_and_boxes = [
-    v2.CenterCrop(size=size)(orig_img, hard_box)
-    for size in (800, 1200, 2000, orig_img.size)
+    v2.CenterCrop(size=size)(orig_img, box_hard_clamping)
+    for size in crop_sizes
+]
+
+no_clamping_center_crops_and_boxes = [
+    v2.CenterCrop(size=size)(orig_img, box_no_clamping)
+    for size in crop_sizes
 ]
 
-plot([[(orig_img, orig_box)] + soft_center_crops_and_boxes,
-      [(orig_img, hard_box)] + hard_center_crops_and_boxes],
+plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes,
+      [(orig_img, orig_box)] + soft_center_crops_and_boxes,
+      [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes],
      bbox_width=10)
+
+# %%
+# The plot above shows the "hard" clamping mode, "soft" and ``None``, in this
+# order. While "soft" and ``None`` result in similar plots, they do not lead to
+# the exact same clamped boxes. The non-clamped boxes will show dimensions that are further away from the image:
+print("boxes with soft clamping:")
+print(soft_center_crops_and_boxes)
+print()
+print("boxes with no clamping:")
+print(no_clamping_center_crops_and_boxes)
+
+# %%
+#
+# Setting the clamping mode
+# --------------------------
+#
+# The ``clamping_mode`` attribute, which determines the clamping strategy that
+# is applied to a box, can be set in different ways:
+#
+# - When constructing the bounding box with its
+#   :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above.
+# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``.
+# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform.
+#
+# Also, remember that you can always clamp the bounding box manually by
+# calling the :class:`~torchvision.transforms.v2.ClampBoundingBoxes` transform!
+# Here's an example illustrating all of these options:
+
+t = v2.Compose([
+    v2.CenterCrop(size=(800,)),  # clamps according to the current clamping_mode
+                                 # attribute, in this case set by the constructor
+    v2.SetClampingMode(None),  # sets the clamping_mode attribute for future transforms
+    v2.Pad(padding=3),  # clamps according to the current clamping_mode
+                        # i.e. ``None``
+    v2.ClampBoundingBoxes(clamping_mode="soft"),  # clamps with "soft" mode.
+])
+
+out_img, out_box = t(orig_img, orig_box)
+plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10)
+
+# %%
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
@@ -27,11 +27,10 @@ def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> t
 class ClampBoundingBoxes(Transform):
     """Clamp bounding boxes to their corresponding image dimensions.
 
-    The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
-
     Args:
-        clamping_mode: TODOBB more docs. Default is None which relies on the input box' clamping_mode attribute.
-
+        clamping_mode: Default is "auto" which relies on the input box's
+            ``clamping_mode`` attribute. See :ref:`clamping_mode_tuto` for
+            more details on how to use this transform.
     """
 
     def __init__(self, clamping_mode: Union[CLAMPING_MODE_TYPE, str] = "auto") -> None:
@@ -57,7 +56,15 @@ def transform(self, inpt: tv_tensors.KeyPoints, params: dict[str, Any]) -> tv_te
 
 
 class SetClampingMode(Transform):
-    """TODOBB"""
+    """Sets the ``clamping_mode`` attribute of the bounding boxes for future transforms.
+
+
+
+    Args:
+        clamping_mode: The clamping mode to set. Possible values are: "soft",
+            "hard", or ``None``. See :ref:`clamping_mode_tuto` for more
+            details on how to use this transform.
+    """
 
     def __init__(self, clamping_mode: CLAMPING_MODE_TYPE) -> None:
         super().__init__()
diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py
@@ -59,12 +59,17 @@ def is_rotated_bounding_format(format: BoundingBoxFormat | str) -> bool:
 # This should ideally be a Literal, but torchscript fails.
 CLAMPING_MODE_TYPE = Optional[str]
 
-# TODOBB All docs. Add any new API to rst files, add tutorial[s].
-
 
 class BoundingBoxes(TVTensor):
     """:class:`torch.Tensor` subclass for bounding boxes with shape ``[N, K]``.
 
+    .. note::
+        Support for rotated bounding boxes was released in TorchVision 0.23 and
+        is currently a BETA feature. We don't expect the API to change, but
+        there may be some rare edge-cases. If you find any issues, please report
+        them on our bug tracker:
+        https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
     Where ``N`` is the number of bounding boxes
     and ``K`` is 4 for unrotated boxes, and 5 or 8 for rotated boxes.
 
@@ -78,7 +83,8 @@ class BoundingBoxes(TVTensor):
         data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
         format (BoundingBoxFormat, str): Format of the bounding box.
         canvas_size (two-tuple of ints): Height and width of the corresponding image or video.
-        clamping_mode: TODOBB
+        clamping_mode: The clamping mode to use when applying transforms that may result in bounding boxes
+            partially outside of the image. Possible values are: "soft", "hard", or ``None``. Read more in :ref:`clamping_mode_tuto`.
         dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
             ``data``.
         device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
diff --git a/torchvision/tv_tensors/_keypoints.py b/torchvision/tv_tensors/_keypoints.py
@@ -11,6 +11,12 @@
 class KeyPoints(TVTensor):
     """:class:`torch.Tensor` subclass for tensors with shape ``[..., 2]`` that represent points in an image.
 
+    .. note::
+        Support for keypoints was released in TorchVision 0.23 and is currently
+        a BETA feature. We don't expect the API to change, but there may be some
+        rare edge-cases. If you find any issues, please report them on our bug
+        tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
     Each point is represented by its X and Y coordinates along the width and height dimensions, respectively.
 
     KeyPoints may represent any object that can be represented by sequences of 2D points: