| 
"""
===============================================================
Transforms on Rotated Bounding Boxes
===============================================================

This example illustrates how to define and use rotated bounding boxes.

.. note::
    Support for rotated bounding boxes was released in TorchVision 0.23 and is
    currently a BETA feature. We don't expect the API to change, but there may
    be some rare edge-cases. If you find any issues, please report them on
    our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue

First, a bit of setup code:
"""

# %%
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt


import torch
from torchvision.tv_tensors import BoundingBoxes
from torchvision.transforms import v2
from helpers import plot

plt.rcParams["figure.figsize"] = [10, 5]
plt.rcParams["savefig.bbox"] = "tight"

# If you change the seed, make sure that the randomly-applied transforms
# properly show that the image can be both transformed and *not* transformed!
torch.manual_seed(0)

# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
orig_img = Image.open(Path('../assets') / 'leaning_tower.jpg')

# %%
# Creating a Rotated Bounding Box
# -------------------------------
# Rotated bounding boxes are created by instantiating the
# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the ``format``
# parameter of the constructor that determines if a bounding box is rotated or
# not. In this instance, we use the CXCYWHR
# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are
# the X and Y coordinates of the center of the bounding box. The next two
# values are the width and height of the bounding box, and the last value is
# the rotation of the bounding box, in degrees.


orig_box = BoundingBoxes(
    [
        [860.0, 1100, 570, 1840, -7],
    ],
    format="CXCYWHR",
    # canvas_size is (height, width); PIL's Image.size is (width, height),
    # hence the swapped indices.
    canvas_size=(orig_img.size[1], orig_img.size[0]),
)

plot([(orig_img, orig_box)], bbox_width=10)

# %%
# Transforms illustrations
# ------------------------
#
# Using :class:`~torchvision.transforms.RandomRotation`:
rotater = v2.RandomRotation(degrees=(0, 180), expand=True)
rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)]
plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10)

# %%
# Using :class:`~torchvision.transforms.Pad`:
padded_imgs_and_boxes = [
    v2.Pad(padding=padding)(orig_img, orig_box)
    for padding in (30, 50, 100, 200)
]
plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10)

# %%
# Using :class:`~torchvision.transforms.Resize`:
resized_imgs = [
    v2.Resize(size=size)(orig_img, orig_box)
    for size in (30, 50, 100, orig_img.size)
]
plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5)

# %%
# Note that the bounding box looking bigger in the images with fewer pixels is
# an artifact, not reality. That is merely the rasterized representation of the
# bounding box's boundaries appearing bigger because we specify a fixed width
# of that rasterized line. When the image is, say, only 30 pixels wide, a
# line that is 3 pixels wide is relatively large.
#
# .. _clamping_mode_tuto:
#
# Clamping Mode, and its effect on transforms
# -------------------------------------------
#
# Some transforms, such as :class:`~torchvision.transforms.CenterCrop`, may
# result in having the transformed bounding box partially outside of the
# transformed (cropped) image. In general, this may happen on most of the
# :ref:`geometric transforms <v2_api_ref>`.
#
# In such cases, the bounding box is clamped to the transformed image size based
# on its ``clamping_mode`` attribute.  There are three values for
# ``clamping_mode``, which determines how the box is clamped after a
# transformation:
#
#  - ``None``: No clamping is applied, and the bounding box may be partially
#    outside of the image.
#  - `"hard"`:  The box is clamped to the image size, such that all its corners
#    are within the image canvas. This potentially results in a loss of
#    information, and it can lead to unintuitive results. But may be necessary
#    for some applications e.g. if the model doesn't support boxes outside of
#    their image.
#  - `"soft"`: This is an intermediate mode between ``None`` and "hard": the
#    box is clamped, but not as strictly as in "hard" mode. Some box dimensions
#    may still be outside of the image. This is the default when constructing
#    :class:`~torchvision.tv_tensors.BoundingBoxes`.
#
# .. note::
#
#       For axis-aligned bounding boxes, the `"soft"` and `"hard"` modes behave
#       the same, as the bounding box is always clamped to the image size.
#
# Let's illustrate the clamping modes with
# :class:`~torchvision.transforms.CenterCrop` transform:

assert orig_box.clamping_mode == "soft"

box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard")

box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None)

crop_sizes = (800, 1200, 2000, orig_img.size)
soft_center_crops_and_boxes = [
    v2.CenterCrop(size=size)(orig_img, orig_box)
    for size in crop_sizes
]

hard_center_crops_and_boxes = [
    v2.CenterCrop(size=size)(orig_img, box_hard_clamping)
    for size in crop_sizes
]

no_clamping_center_crops_and_boxes = [
    v2.CenterCrop(size=size)(orig_img, box_no_clamping)
    for size in crop_sizes
]

# One row per clamping mode: "hard", then "soft", then None.
plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes,
      [(orig_img, orig_box)] + soft_center_crops_and_boxes,
      [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes],
     bbox_width=10)

# %%
# The plot above shows the "hard" clamping mode, "soft" and ``None``, in this
# order. While "soft" and ``None`` result in similar plots, they do not lead to
# the exact same clamped boxes. The non-clamped boxes will show dimensions that
# are further away from the image:
print("boxes with soft clamping:")
print(soft_center_crops_and_boxes)
print()
print("boxes with no clamping:")
print(no_clamping_center_crops_and_boxes)

# %%
#
# Setting the clamping mode
# --------------------------
#
# The ``clamping_mode`` attribute, which determines the clamping strategy that
# is applied to a box, can be set in different ways:
#
# - When constructing the bounding box with its
#   :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above.
# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``.
# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform.
#
# Also, remember that you can always clamp the bounding box manually by
# calling the :meth:`~torchvision.transforms.v2.ClampBoundingBoxes` transform!
# Here's an example illustrating all of these options:

t = v2.Compose([
    v2.CenterCrop(size=(800,)),  # clamps according to the current clamping_mode
                                 # attribute, in this case set by the constructor
    v2.SetClampingMode(None),  # sets the clamping_mode attribute for future transforms
    v2.Pad(padding=3),  # clamps according to the current clamping_mode
                        # i.e. ``None``
    v2.ClampBoundingBoxes(clamping_mode="soft"),  # clamps with "soft" mode.
])

out_img, out_box = t(orig_img, orig_box)
plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10)

# %%
0 commit comments