Commit 2f26e5a

Merge branch 'main' of github.com:pytorch/vision into contribution_libwebp
2 parents b00d790 + 7b283c2

32 files changed: +2356 additions, -237 deletions

.github/workflows/build-cmake.yml

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ jobs:
             gpu-arch-type: cpu
           - runner: linux.g5.4xlarge.nvidia.gpu
             gpu-arch-type: cuda
-            gpu-arch-version: "11.8"
+            gpu-arch-version: "12.6"
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
@@ -66,7 +66,7 @@ jobs:
             gpu-arch-type: cpu
           - runner: windows.g5.4xlarge.nvidia.gpu
             gpu-arch-type: cuda
-            gpu-arch-version: "11.8"
+            gpu-arch-version: "12.6"
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
     with:

.github/workflows/prototype-tests-linux-gpu.yml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ jobs:
           - python-version: "3.9"
             runner: linux.g5.4xlarge.nvidia.gpu
             gpu-arch-type: cuda
-            gpu-arch-version: "11.8"
+            gpu-arch-version: "12.6"
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ jobs:
           - python-version: 3.9
             runner: linux.g5.4xlarge.nvidia.gpu
             gpu-arch-type: cuda
-            gpu-arch-version: "11.8"
+            gpu-arch-version: "12.6"
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:

docs/source/transforms.rst

Lines changed: 3 additions & 1 deletion
@@ -101,7 +101,7 @@ range of the inputs.
 V1 or V2? Which one should I use?
 ---------------------------------
 
-**TL;DR** We recommending using the ``torchvision.transforms.v2`` transforms
+**TL;DR** We recommend using the ``torchvision.transforms.v2`` transforms
 instead of those in ``torchvision.transforms``. They're faster and they can do
 more things. Just change the import and you should be good to go. Moving
 forward, new features and improvements will only be considered for the v2
@@ -408,6 +408,7 @@ Miscellaneous
     v2.Lambda
     v2.SanitizeBoundingBoxes
     v2.ClampBoundingBoxes
+    v2.ClampKeyPoints
     v2.UniformTemporalSubsample
     v2.JPEG
 
@@ -421,6 +422,7 @@ Functionals
     v2.functional.erase
     v2.functional.sanitize_bounding_boxes
     v2.functional.clamp_bounding_boxes
+    v2.functional.clamp_keypoints
     v2.functional.uniform_temporal_subsample
     v2.functional.jpeg
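
The "just change the import" claim above is the whole migration story for most users. A minimal sketch of that swap, assuming a typical classification-style pipeline (the specific transforms below are illustrative, not part of this commit):

import torch
from torchvision.transforms import v2  # previously: import torchvision.transforms as T

# Same pipeline as with the v1 API; only the import changes.
transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
])

img = torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)
print(transforms(img).shape)  # torch.Size([3, 224, 224])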

docs/source/tv_tensors.rst

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ info.
 
     Image
     Video
+    KeyPoints
     BoundingBoxFormat
     BoundingBoxes
     Mask

gallery/transforms/plot_tv_tensors.py

Lines changed: 10 additions & 1 deletion
@@ -46,11 +46,12 @@
 # Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
 # for the input data.
 #
-# :mod:`torchvision.tv_tensors` supports four types of TVTensors:
+# :mod:`torchvision.tv_tensors` supports five types of TVTensors:
 #
 # * :class:`~torchvision.tv_tensors.Image`
 # * :class:`~torchvision.tv_tensors.Video`
 # * :class:`~torchvision.tv_tensors.BoundingBoxes`
+# * :class:`~torchvision.tv_tensors.KeyPoints`
 # * :class:`~torchvision.tv_tensors.Mask`
 #
 # What can I do with a TVTensor?
@@ -96,6 +97,7 @@
 # :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the
 # corresponding image (``canvas_size``) alongside the actual values. These
 # metadata are required to properly transform the bounding boxes.
+# In a similar fashion, :class:`~torchvision.tv_tensors.KeyPoints` also require the ``canvas_size`` metadata to be added.
 
 bboxes = tv_tensors.BoundingBoxes(
     [[17, 16, 344, 495], [0, 10, 0, 10]],
@@ -104,6 +106,13 @@
 )
 print(bboxes)
 
+
+keypoints = tv_tensors.KeyPoints(
+    [[17, 16], [344, 495], [0, 10], [0, 10]],
+    canvas_size=image.shape[-2:]
+)
+print(keypoints)
+
 # %%
 # Using ``tv_tensors.wrap()``
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^
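
Like BoundingBoxes, the new KeyPoints class carries ``canvas_size`` so the v2 transforms can act on it. A hedged sketch of that interaction, assuming KeyPoints dispatches through v2 geometry transforms the same way BoundingBoxes does (coordinates and sizes below are made up for illustration):

from torchvision import tv_tensors
from torchvision.transforms import v2

# Keypoints on a 480x640 canvas, resized to half size; coordinates are (x, y).
keypoints = tv_tensors.KeyPoints([[17, 16], [344, 295]], canvas_size=(480, 640))
resized = v2.Resize(size=(240, 320))(keypoints)
print(resized)              # coordinates scaled by 0.5
print(resized.canvas_size)  # (240, 320)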

references/detection/coco_utils.py

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ def convert_to_coco_api(ds):
     coco_ds = COCO()
     # annotation IDs need to start at 1, not 0, see torchvision issue #1530
     ann_id = 1
-    dataset = {"images": [], "categories": [], "annotations": []}
+    dataset = {"images": [], "categories": [], "annotations": [], "info": {}}
     categories = set()
     for img_idx in range(len(ds)):
         # find better way to get target
test/common_utils.py

Lines changed: 21 additions & 4 deletions
@@ -21,7 +21,7 @@
 from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair
 from torchvision import io, tv_tensors
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
-from torchvision.transforms.v2.functional import to_image, to_pil_image
+from torchvision.transforms.v2.functional import clamp_bounding_boxes, to_image, to_pil_image
 
 
 IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"])
@@ -400,6 +400,12 @@ def make_image_pil(*args, **kwargs):
     return to_pil_image(make_image(*args, **kwargs))
 
 
+def make_keypoints(canvas_size=DEFAULT_SIZE, *, num_points=4, dtype=None, device="cpu"):
+    y = torch.randint(0, canvas_size[0], size=(num_points, 1), dtype=dtype, device=device)
+    x = torch.randint(0, canvas_size[1], size=(num_points, 1), dtype=dtype, device=device)
+    return tv_tensors.KeyPoints(torch.cat((x, y), dim=-1), canvas_size=canvas_size)
+
+
 def make_bounding_boxes(
     canvas_size=DEFAULT_SIZE,
     *,
@@ -461,9 +467,20 @@ def sample_position(values, max_value):
         parts = (x1, y1, x2, y2, x3, y3, x4, y4)
     else:
         raise ValueError(f"Format {format} is not supported")
-    return tv_tensors.BoundingBoxes(
-        torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size
-    )
+    out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device)
+    if tv_tensors.is_rotated_bounding_format(format):
+        # The rotated bounding boxes are not guaranteed to be within the canvas by design,
+        # so we apply clamping. We also add a 2 buffer to the canvas size to avoid
+        # numerical issues during the testing
+        buffer = 4
+        out_boxes = clamp_bounding_boxes(
+            out_boxes, format=format, canvas_size=(canvas_size[0] - buffer, canvas_size[1] - buffer)
+        )
+        if format is tv_tensors.BoundingBoxFormat.XYWHR or format is tv_tensors.BoundingBoxFormat.CXCYWHR:
+            out_boxes[:, :2] += buffer // 2
+        elif format is tv_tensors.BoundingBoxFormat.XYXYXYXY:
+            out_boxes[:, :] += buffer // 2
+    return tv_tensors.BoundingBoxes(out_boxes, format=format, canvas_size=canvas_size)
 
 
 def make_detection_masks(size=DEFAULT_SIZE, *, num_masks=1, dtype=None, device="cpu"):
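
Standalone sketch of what the new make_keypoints() helper produces: a (num_points, 2) KeyPoints tensor of (x, y) pairs drawn inside the canvas. DEFAULT_SIZE is internal to common_utils.py, so a stand-in canvas size is used here:

import torch
from torchvision import tv_tensors

canvas_size = (32, 24)  # stand-in for DEFAULT_SIZE; (height, width)
num_points = 4
y = torch.randint(0, canvas_size[0], size=(num_points, 1))
x = torch.randint(0, canvas_size[1], size=(num_points, 1))
keypoints = tv_tensors.KeyPoints(torch.cat((x, y), dim=-1), canvas_size=canvas_size)
print(keypoints.shape)        # torch.Size([4, 2])
print(keypoints.canvas_size)  # (32, 24)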

test/test_image.py

Lines changed: 27 additions & 0 deletions
@@ -1,4 +1,5 @@
 import concurrent.futures
+import contextlib
 import glob
 import io
 import os
@@ -934,6 +935,32 @@ def test_decode_webp(decode_fun, scripted):
     img += 123  # make sure image buffer wasn't freed by underlying decoding lib
 
 
+@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image))
+def test_decode_webp_grayscale(decode_fun, capfd):
+    encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".webp")))
+
+    # We warn at the C++ layer because for decode_image(), we don't do the image
+    # type dispatch until we get to the C++ version of decode_image(). We could
+    # warn at the Python layer in decode_webp(), but then users would get a
+    # double warning: one from the Python layer and one from the C++ layer.
+    #
+    # Because we use the TORCH_WARN_ONCE macro, we need to do this dance to
+    # temporarily always warn so we can test.
+    @contextlib.contextmanager
+    def set_always_warn():
+        torch._C._set_warnAlways(True)
+        yield
+        torch._C._set_warnAlways(False)
+
+    with set_always_warn():
+        img = decode_fun(encoded_bytes, mode=ImageReadMode.GRAY)
+        assert "Webp does not support grayscale conversions" in capfd.readouterr().err
+
+    # Note that because we do not support grayscale conversions, we expect
+    # that the number of color channels is still 3.
+    assert img.shape == (3, 100, 100)
+
+
 # This test is skipped by default because it requires webp images that we're not
 # including within the repo. The test images were downloaded manually from the
 # different pages of https://developers.google.com/speed/webp/gallery
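
The behaviour the new test pins down, as a hedged sketch against the public io API (the .webp path is a placeholder; per the test above, requesting GRAY leaves the channel count at 3 and emits a warning from the C++ layer):

from torchvision.io import ImageReadMode, decode_image, read_file

encoded = read_file("example.webp")  # placeholder path
img = decode_image(encoded, mode=ImageReadMode.GRAY)
print(img.shape[0])  # 3 -- grayscale conversion is not applied for webp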

test/test_ops.py

Lines changed: 3 additions & 3 deletions
@@ -929,6 +929,7 @@ def test_batched_nms_implementations(self, seed):
 
 class TestDeformConv:
     dtype = torch.float64
+    mps_dtype = torch.float32
 
     def expected_fn(self, x, weight, offset, mask, bias, stride=1, padding=0, dilation=1):
         stride_h, stride_w = _pair(stride)
@@ -1050,12 +1051,11 @@ def test_is_leaf_node(self, device):
         assert len(graph_node_names[0]) == len(graph_node_names[1])
         assert len(graph_node_names[0]) == 1 + op_obj.n_inputs
 
-    @pytest.mark.parametrize("device", cpu_and_cuda())
+    @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
     @pytest.mark.parametrize("contiguous", (True, False))
     @pytest.mark.parametrize("batch_sz", (0, 33))
-    @pytest.mark.opcheck_only_one()
     def test_forward(self, device, contiguous, batch_sz, dtype=None):
-        dtype = dtype or self.dtype
+        dtype = self.mps_dtype if device == "mps" else dtype or self.dtype
         x, _, offset, mask, _, stride, padding, dilation = self.get_fn_args(device, contiguous, batch_sz, dtype)
         in_channels = 6
         out_channels = 2
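
What the widened parametrization covers, as a hedged sketch: deform_conv2d on the MPS backend in float32 (float64 is not supported there, hence the mps_dtype attribute above), falling back to CPU when MPS is unavailable. Shapes are illustrative:

import torch
from torchvision.ops import deform_conv2d

device = "mps" if torch.backends.mps.is_available() else "cpu"
dtype = torch.float32

x = torch.rand(1, 6, 10, 10, device=device, dtype=dtype)
weight = torch.rand(2, 6, 3, 3, device=device, dtype=dtype)
# offset needs 2 * kernel_h * kernel_w channels and the spatial size of the output
offset = torch.rand(1, 2 * 3 * 3, 8, 8, device=device, dtype=dtype)

out = deform_conv2d(x, offset, weight)  # stride=1, padding=0 -> 8x8 output
print(out.shape)  # torch.Size([1, 2, 8, 8])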
