Merge branch 'main' into internallinters

AntoineSimoulin · web-flow · commit 1fc4ad32efd8 · 2025-09-09T08:10:20.000-05:00
diff --git a/setup.py b/setup.py
@@ -6,6 +6,7 @@
 import shutil
 import subprocess
 import sys
+import sysconfig
 import warnings
 from pathlib import Path
 
@@ -136,6 +137,8 @@ def get_macros_and_flags():
     if sys.platform == "win32":
         define_macros += [("torchvision_EXPORTS", None)]
         extra_compile_args["cxx"].append("/MP")
+        if sysconfig.get_config_var("Py_GIL_DISABLED"):
+            extra_compile_args["cxx"].append("-DPy_GIL_DISABLED")
 
     if DEBUG:
         extra_compile_args["cxx"].append("-g")
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
@@ -3993,7 +3993,7 @@ class TestGaussianNoise:
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_kernel(self, make_input):
+    def test_kernel_float(self, make_input):
         check_kernel(
             F.gaussian_noise,
             make_input(dtype=torch.float32),
@@ -4005,9 +4005,28 @@ def test_kernel(self, make_input):
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_functional(self, make_input):
+    def test_kernel_uint8(self, make_input):
+        check_kernel(
+            F.gaussian_noise,
+            make_input(dtype=torch.uint8),
+            # This cannot pass because the noise on a batch in not per-image
+            check_batched_vs_unbatched=False,
+        )
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_functional_float(self, make_input):
         check_functional(F.gaussian_noise, make_input(dtype=torch.float32))
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_functional_uint8(self, make_input):
+        check_functional(F.gaussian_noise, make_input(dtype=torch.uint8))
+
     @pytest.mark.parametrize(
         ("kernel", "input_type"),
         [
@@ -4023,10 +4042,11 @@ def test_functional_signature(self, kernel, input_type):
         "make_input",
         [make_image_tensor, make_image, make_video],
     )
-    def test_transform(self, make_input):
+    def test_transform_float(self, make_input):
         def adapter(_, input, __):
-            # This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32
-            # Same for PIL images
+            # We have two different implementations for floats and uint8
+            # To test this implementation we'll convert the auto-generated uint8 tensors to float32
+            # We don't support other int dtypes nor pil images
             for key, value in input.items():
                 if isinstance(value, torch.Tensor) and not value.is_floating_point():
                     input[key] = value.to(torch.float32)
@@ -4036,11 +4056,29 @@ def adapter(_, input, __):
 
         check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter)
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_transform_uint8(self, make_input):
+        def adapter(_, input, __):
+            # We have two different implementations for floats and uint8
+            # To test this implementation we'll convert every tensor to uint8
+            # We don't support other int dtypes nor pil images
+            for key, value in input.items():
+                if isinstance(value, torch.Tensor) and not value.dtype != torch.uint8:
+                    input[key] = value.to(torch.uint8)
+                if isinstance(value, PIL.Image.Image):
+                    input[key] = F.pil_to_tensor(value).to(torch.uint8)
+            return input
+
+        check_transform(transforms.GaussianNoise(), make_input(dtype=torch.uint8), check_sample_input=adapter)
+
     def test_bad_input(self):
         with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."):
             F.gaussian_noise(make_image_pil())
-        with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"):
-            F.gaussian_noise(make_image(dtype=torch.uint8))
+        with pytest.raises(ValueError, match="Input tensor is expected to be in uint8 or float dtype"):
+            F.gaussian_noise(make_image(dtype=torch.int32))
         with pytest.raises(ValueError, match="sigma shouldn't be negative"):
             F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1)
 
diff --git a/torchvision/datasets/caltech.py b/torchvision/datasets/caltech.py
@@ -1,11 +1,12 @@
 import os
 import os.path
+import shutil
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
 
 from PIL import Image
 
-from .utils import download_and_extract_archive, verify_str_arg
+from .utils import download_and_extract_archive, extract_archive, verify_str_arg
 from .vision import VisionDataset
 
 
@@ -133,17 +134,17 @@ def download(self) -> None:
             return
 
         download_and_extract_archive(
-            "https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp",
-            self.root,
-            filename="101_ObjectCategories.tar.gz",
-            md5="b224c7392d521a49829488ab0f1120d9",
-        )
-        download_and_extract_archive(
-            "https://drive.google.com/file/d/175kQy3UsZ0wUEHZjqkUDdNVssr7bgh_m",
-            self.root,
-            filename="Annotations.tar",
-            md5="6f83eeb1f24d99cab4eb377263132c91",
+            "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+            download_root=self.root,
+            filename="caltech-101.zip",
+            md5="3138e1922a9193bfa496528edbbc45d0",
         )
+        gzip_folder = os.path.join(self.root, "caltech-101")
+        for gzip_file in os.listdir(gzip_folder):
+            if gzip_file.endswith(".gz"):
+                extract_archive(os.path.join(gzip_folder, gzip_file), self.root)
+        shutil.rmtree(gzip_folder)
+        os.remove(os.path.join(self.root, "caltech-101.zip"))
 
     def extra_repr(self) -> str:
         return "Target type: {target_type}".format(**self.__dict__)
@@ -233,7 +234,7 @@ def download(self) -> None:
             return
 
         download_and_extract_archive(
-            "https://drive.google.com/file/d/1r6o0pSROcV1_VwT4oSjA2FBUSCWGuxLK",
+            "https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar",
             self.root,
             filename="256_ObjectCategories.tar",
             md5="67b4f42ca05d46448c6bb8ecd2220f6d",
diff --git a/torchvision/ops/drop_block.py b/torchvision/ops/drop_block.py
@@ -36,6 +36,9 @@ def drop_block2d(
 
     N, C, H, W = input.size()
     block_size = min(block_size, W, H)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+
     # compute the gamma of Bernoulli distribution
     gamma = (p * H * W) / ((block_size**2) * ((H - block_size + 1) * (W - block_size + 1)))
     noise = torch.empty((N, C, H - block_size + 1, W - block_size + 1), dtype=input.dtype, device=input.device)
@@ -82,6 +85,9 @@ def drop_block3d(
 
     N, C, D, H, W = input.size()
     block_size = min(block_size, D, H, W)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+
     # compute the gamma of Bernoulli distribution
     gamma = (p * D * H * W) / ((block_size**3) * ((D - block_size + 1) * (H - block_size + 1) * (W - block_size + 1)))
     noise = torch.empty(
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
@@ -214,13 +214,22 @@ class GaussianNoise(Transform):
     Each image or frame in a batch will be transformed independently i.e. the
     noise added to each image will be different.
 
-    The input tensor is also expected to be of float dtype in ``[0, 1]``.
-    This transform does not support PIL images.
+    The input tensor is also expected to be of float dtype in ``[0, 1]``,
+    or of ``uint8`` dtype in ``[0, 255]``. This transform does not support PIL
+    images.
+
+    Regardless of the dtype used, the parameters of the function use the same
+    scale, so a ``mean`` parameter of 0.5 will result in an average value
+    increase of 0.5 units for float images, and an average increase of 127.5
+    units for ``uint8`` images.
 
     Args:
         mean (float): Mean of the sampled normal distribution. Default is 0.
         sigma (float): Standard deviation of the sampled normal distribution. Default is 0.1.
-        clip (bool, optional): Whether to clip the values in ``[0, 1]`` after adding noise. Default is True.
+        clip (bool, optional): Whether to clip the values after adding noise, be it to
+            ``[0, 1]`` for floats or to ``[0, 255]`` for ``uint8``. Setting this parameter to
+            ``False`` may cause unsigned integer overflows with uint8 inputs.
+            Default is True.
     """
 
     def __init__(self, mean: float = 0.0, sigma: float = 0.1, clip=True) -> None:
diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py
@@ -195,16 +195,28 @@ def gaussian_noise(inpt: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, cl
 @_register_kernel_internal(gaussian_noise, torch.Tensor)
 @_register_kernel_internal(gaussian_noise, tv_tensors.Image)
 def gaussian_noise_image(image: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor:
-    if not image.is_floating_point():
-        raise ValueError(f"Input tensor is expected to be in float dtype, got dtype={image.dtype}")
     if sigma < 0:
         raise ValueError(f"sigma shouldn't be negative. Got {sigma}")
 
-    noise = mean + torch.randn_like(image) * sigma
-    out = image + noise
-    if clip:
-        out = torch.clamp(out, 0, 1)
-    return out
+    if image.is_floating_point():
+        noise = mean + torch.randn_like(image) * sigma
+        out = image + noise
+        if clip:
+            out = torch.clamp(out, 0, 1)
+        return out
+
+    elif image.dtype == torch.uint8:
+        # Convert to intermediate dtype int16 to add to input more efficiently
+        # See https://github.com/pytorch/vision/pull/9169 for alternative implementations and benchmark
+        noise = ((mean * 255) + torch.randn_like(image, dtype=torch.float32) * (sigma * 255)).to(torch.int16)
+        out = image + noise
+
+        if clip:
+            out = torch.clamp(out, 0, 255)
+        return out.to(torch.uint8)
+
+    else:
+        raise ValueError(f"Input tensor is expected to be in uint8 or float dtype, got dtype={image.dtype}")
 
 
 @_register_kernel_internal(gaussian_noise, tv_tensors.Video)