Merge branch 'main' into keypoints-tutorial

AntoineSimoulin · web-flow · commit 3ab6f75bf5a2 · 2025-09-09T12:54:10.000-05:00
diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh
@@ -23,15 +23,13 @@ case $(uname) in
 esac
 
 echo '::group::Create build environment'
-# See https://github.com/pytorch/vision/issues/7296 for ffmpeg
 conda create \
   --name ci \
   --quiet --yes \
   python="${PYTHON_VERSION}" pip \
   ninja cmake \
   libpng \
-  libwebp \
-  'ffmpeg<4.3'
+  libwebp
 conda activate ci
 conda install --quiet --yes libjpeg-turbo -c pytorch
 pip install --progress-bar=off --upgrade setuptools==72.1.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,8 +11,6 @@ option(WITH_JPEG "Enable features requiring LibJPEG." ON)
 # untested. Since building from cmake is very low pri anyway, this is OK. If
 # you're a user and you need this, please open an issue (and a PR!).
 option(WITH_WEBP "Enable features requiring LibWEBP." OFF)
-# Same here
-option(WITH_AVIF "Enable features requiring LibAVIF." OFF)
 
 if(WITH_CUDA)
   enable_language(CUDA)
@@ -40,12 +38,7 @@ endif()
 
 if (WITH_WEBP)
     add_definitions(-DWEBP_FOUND)
-    find_package(WEBP REQUIRED)
-endif()
-
-if (WITH_AVIF)
-    add_definitions(-DAVIF_FOUND)
-    find_package(AVIF REQUIRED)
+    find_package(WebP REQUIRED)
 endif()
 
 function(CUDA_CONVERT_FLAGS EXISTING_TARGET)
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ versions.
 
 | `torch`            | `torchvision`      | Python              |
 | ------------------ | ------------------ | ------------------- |
-| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12`   |
+| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.13`  |
 | `2.8`              | `0.23`             | `>=3.9`, `<=3.13`   |
 | `2.7`              | `0.22`             | `>=3.9`, `<=3.13`   |
 | `2.6`              | `0.21`             | `>=3.9`, `<=3.12`   |
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
@@ -17,7 +17,6 @@ if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
   # Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff
   # before re-installing them
   conda uninstall libjpeg-turbo libjpeg -y
-  conda install -y ffmpeg=4.2 -c pytorch
   conda install -y libjpeg-turbo -c pytorch
 
   # Copy binaries to be included in the wheel distribution
@@ -30,7 +29,7 @@ else
 
   if [[ "$ARCH" == "aarch64" ]]; then
     conda install libpng -y
-    conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly
+    conda install -y libjpeg-turbo -c pytorch-nightly
   fi
 
   conda install libwebp -y
diff --git a/setup.py b/setup.py
@@ -6,6 +6,7 @@
 import shutil
 import subprocess
 import sys
+import sysconfig
 import warnings
 from pathlib import Path
 
@@ -27,9 +28,9 @@
 # video decoding backends in torchvision. I'm renaming this to "gpu video
 # decoder" where possible, keeping user facing names (like the env var below) to
 # the old scheme for BC.
-USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1"
+USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "0") == "1"
 # Same here: "use ffmpeg" was used to denote "use cpu video decoder".
-USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1"
+USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "0") == "1"
 
 TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "")
 TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "")
@@ -136,6 +137,8 @@ def get_macros_and_flags():
     if sys.platform == "win32":
         define_macros += [("torchvision_EXPORTS", None)]
         extra_compile_args["cxx"].append("/MP")
+        if sysconfig.get_config_var("Py_GIL_DISABLED"):
+            extra_compile_args["cxx"].append("-DPy_GIL_DISABLED")
 
     if DEBUG:
         extra_compile_args["cxx"].append("-g")
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -3993,7 +3993,7 @@ class TestGaussianNoise:
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_kernel(self, make_input):
+    def test_kernel_float(self, make_input):
         check_kernel(
             F.gaussian_noise,
             make_input(dtype=torch.float32),
@@ -4005,9 +4005,28 @@ def test_kernel(self, make_input):
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_functional(self, make_input):
+    def test_kernel_uint8(self, make_input):
+        check_kernel(
+            F.gaussian_noise,
+            make_input(dtype=torch.uint8),
+            # This cannot pass because the noise on a batch is not per-image
+            check_batched_vs_unbatched=False,
+        )
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_functional_float(self, make_input):
         check_functional(F.gaussian_noise, make_input(dtype=torch.float32))
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_functional_uint8(self, make_input):
+        check_functional(F.gaussian_noise, make_input(dtype=torch.uint8))
+
     @pytest.mark.parametrize(
         ("kernel", "input_type"),
         [
@@ -4023,10 +4042,11 @@ def test_functional_signature(self, kernel, input_type):
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_transform(self, make_input):
+    def test_transform_float(self, make_input):
         def adapter(_, input, __):
-            # This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32
-            # Same for PIL images
+            # We have two different implementations for floats and uint8
+            # To test this implementation we'll convert the auto-generated uint8 tensors to float32
+            # We don't support other int dtypes nor PIL images
             for key, value in input.items():
                 if isinstance(value, torch.Tensor) and not value.is_floating_point():
                     input[key] = value.to(torch.float32)
@@ -4036,11 +4056,29 @@ def adapter(_, input, __):
 
         check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter)
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_transform_uint8(self, make_input):
+        def adapter(_, input, __):
+            # Floats and uint8 go through two different implementations.
+            # To exercise the uint8 one, convert every non-uint8 tensor in the sample to uint8.
+            # Other int dtypes and PIL images are not supported, so PIL images become uint8 tensors too.
+            for key, value in input.items():
+                if isinstance(value, torch.Tensor) and value.dtype != torch.uint8:
+                    input[key] = value.to(torch.uint8)
+                if isinstance(value, PIL.Image.Image):
+                    input[key] = F.pil_to_tensor(value).to(torch.uint8)
+            return input
+
+        check_transform(transforms.GaussianNoise(), make_input(dtype=torch.uint8), check_sample_input=adapter)
+
     def test_bad_input(self):
         with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."):
             F.gaussian_noise(make_image_pil())
-        with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"):
-            F.gaussian_noise(make_image(dtype=torch.uint8))
+        with pytest.raises(ValueError, match="Input tensor is expected to be in uint8 or float dtype"):
+            F.gaussian_noise(make_image(dtype=torch.int32))
         with pytest.raises(ValueError, match="sigma shouldn't be negative"):
             F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1)
 
diff --git a/torchvision/csrc/ops/cuda/nms_kernel.cu b/torchvision/csrc/ops/cuda/nms_kernel.cu
@@ -34,8 +34,8 @@ __global__ void nms_kernel_impl(
     double iou_threshold,
     const T* dev_boxes,
     unsigned long long* dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
+  const auto row_start = blockIdx.y;
+  const auto col_start = blockIdx.x;
 
   if (row_start > col_start)
     return;
@@ -59,7 +59,7 @@ __global__ void nms_kernel_impl(
   __syncthreads();
 
   if (threadIdx.x < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const auto cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
     const T* cur_box = dev_boxes + cur_box_idx * 4;
     int i = 0;
     unsigned long long t = 0;
@@ -84,7 +84,7 @@ __global__ static void gather_keep_from_mask(
   // Taken and adapted from mmcv
   // https://github.com/open-mmlab/mmcv/blob/03ce9208d18c0a63d7ffa087ea1c2f5661f2441a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh#L76
   const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
-  const int thread_id = threadIdx.x;
+  const auto thread_id = threadIdx.x;
 
   // Mark the bboxes which have been removed.
   extern __shared__ unsigned long long removed[];
diff --git a/torchvision/datasets/caltech.py b/torchvision/datasets/caltech.py
@@ -1,11 +1,12 @@
 import os
 import os.path
+import shutil
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
 
 from PIL import Image
 
-from .utils import download_and_extract_archive, verify_str_arg
+from .utils import download_and_extract_archive, extract_archive, verify_str_arg
 from .vision import VisionDataset
 
 
@@ -133,17 +134,17 @@ def download(self) -> None:
             return
 
         download_and_extract_archive(
-            "https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp",
-            self.root,
-            filename="101_ObjectCategories.tar.gz",
-            md5="b224c7392d521a49829488ab0f1120d9",
-        )
-        download_and_extract_archive(
-            "https://drive.google.com/file/d/175kQy3UsZ0wUEHZjqkUDdNVssr7bgh_m",
-            self.root,
-            filename="Annotations.tar",
-            md5="6f83eeb1f24d99cab4eb377263132c91",
+            "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+            download_root=self.root,
+            filename="caltech-101.zip",
+            md5="3138e1922a9193bfa496528edbbc45d0",
         )
+        gzip_folder = os.path.join(self.root, "caltech-101")
+        for gzip_file in os.listdir(gzip_folder):
+            if gzip_file.endswith(".gz"):
+                extract_archive(os.path.join(gzip_folder, gzip_file), self.root)
+        shutil.rmtree(gzip_folder)
+        os.remove(os.path.join(self.root, "caltech-101.zip"))
 
     def extra_repr(self) -> str:
         return "Target type: {target_type}".format(**self.__dict__)
@@ -233,7 +234,7 @@ def download(self) -> None:
             return
 
         download_and_extract_archive(
-            "https://drive.google.com/file/d/1r6o0pSROcV1_VwT4oSjA2FBUSCWGuxLK",
+            "https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar",
             self.root,
             filename="256_ObjectCategories.tar",
             md5="67b4f42ca05d46448c6bb8ecd2220f6d",
diff --git a/torchvision/ops/drop_block.py b/torchvision/ops/drop_block.py
@@ -36,6 +36,9 @@ def drop_block2d(
 
     N, C, H, W = input.size()
     block_size = min(block_size, W, H)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+
     # compute the gamma of Bernoulli distribution
     gamma = (p * H * W) / ((block_size**2) * ((H - block_size + 1) * (W - block_size + 1)))
     noise = torch.empty((N, C, H - block_size + 1, W - block_size + 1), dtype=input.dtype, device=input.device)
@@ -82,6 +85,9 @@ def drop_block3d(
 
     N, C, D, H, W = input.size()
     block_size = min(block_size, D, H, W)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+
     # compute the gamma of Bernoulli distribution
     gamma = (p * D * H * W) / ((block_size**3) * ((D - block_size + 1) * (H - block_size + 1) * (W - block_size + 1)))
     noise = torch.empty(
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
@@ -214,13 +214,22 @@ class GaussianNoise(Transform):
     Each image or frame in a batch will be transformed independently i.e. the
     noise added to each image will be different.
 
-    The input tensor is also expected to be of float dtype in ``[0, 1]``.
-    This transform does not support PIL images.
+    The input tensor is also expected to be of float dtype in ``[0, 1]``,
+    or of ``uint8`` dtype in ``[0, 255]``. This transform does not support PIL
+    images.
+
+    Regardless of the dtype used, the parameters of the function use the same
+    scale, so a ``mean`` parameter of 0.5 will result in an average value
+    increase of 0.5 units for float images, and an average increase of 127.5
+    units for ``uint8`` images.
 
     Args:
         mean (float): Mean of the sampled normal distribution. Default is 0.
         sigma (float): Standard deviation of the sampled normal distribution. Default is 0.1.
-        clip (bool, optional): Whether to clip the values in ``[0, 1]`` after adding noise. Default is True.
+        clip (bool, optional): Whether to clip the values after adding noise, be it to
+            ``[0, 1]`` for floats or to ``[0, 255]`` for ``uint8``. Setting this parameter to
+            ``False`` may cause unsigned integer overflows with ``uint8`` inputs.
+            Default is True.
     """
 
     def __init__(self, mean: float = 0.0, sigma: float = 0.1, clip=True) -> None:
diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py
@@ -195,16 +195,28 @@ def gaussian_noise(inpt: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, cl
 @_register_kernel_internal(gaussian_noise, torch.Tensor)
 @_register_kernel_internal(gaussian_noise, tv_tensors.Image)
 def gaussian_noise_image(image: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor:
-    if not image.is_floating_point():
-        raise ValueError(f"Input tensor is expected to be in float dtype, got dtype={image.dtype}")
     if sigma < 0:
         raise ValueError(f"sigma shouldn't be negative. Got {sigma}")
 
-    noise = mean + torch.randn_like(image) * sigma
-    out = image + noise
-    if clip:
-        out = torch.clamp(out, 0, 1)
-    return out
+    if image.is_floating_point():
+        noise = mean + torch.randn_like(image) * sigma
+        out = image + noise
+        if clip:
+            out = torch.clamp(out, 0, 1)
+        return out
+
+    elif image.dtype == torch.uint8:
+        # Convert to intermediate dtype int16 to add to input more efficiently
+        # See https://github.com/pytorch/vision/pull/9169 for alternative implementations and benchmark
+        noise = ((mean * 255) + torch.randn_like(image, dtype=torch.float32) * (sigma * 255)).to(torch.int16)
+        out = image + noise
+
+        if clip:
+            out = torch.clamp(out, 0, 255)
+        return out.to(torch.uint8)
+
+    else:
+        raise ValueError(f"Input tensor is expected to be in uint8 or float dtype, got dtype={image.dtype}")
 
 
 @_register_kernel_internal(gaussian_noise, tv_tensors.Video)