Skip to content

Commit 3ab6f75

Browse files
Merge branch 'main' into keypoints-tutorial
2 parents a07475e + a8dc530 commit 3ab6f75

File tree

11 files changed

+108
-49
lines changed

11 files changed

+108
-49
lines changed

.github/scripts/setup-env.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,13 @@ case $(uname) in
2323
esac
2424

2525
echo '::group::Create build environment'
26-
# See https://github.com/pytorch/vision/issues/7296 for ffmpeg
2726
conda create \
2827
--name ci \
2928
--quiet --yes \
3029
python="${PYTHON_VERSION}" pip \
3130
ninja cmake \
3231
libpng \
33-
libwebp \
34-
'ffmpeg<4.3'
32+
libwebp
3533
conda activate ci
3634
conda install --quiet --yes libjpeg-turbo -c pytorch
3735
pip install --progress-bar=off --upgrade setuptools==72.1.0

CMakeLists.txt

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ option(WITH_JPEG "Enable features requiring LibJPEG." ON)
1111
# untested. Since building from cmake is very low pri anyway, this is OK. If
1212
# you're a user and you need this, please open an issue (and a PR!).
1313
option(WITH_WEBP "Enable features requiring LibWEBP." OFF)
14-
# Same here
15-
option(WITH_AVIF "Enable features requiring LibAVIF." OFF)
1614

1715
if(WITH_CUDA)
1816
enable_language(CUDA)
@@ -40,12 +38,7 @@ endif()
4038

4139
if (WITH_WEBP)
4240
add_definitions(-DWEBP_FOUND)
43-
find_package(WEBP REQUIRED)
44-
endif()
45-
46-
if (WITH_AVIF)
47-
add_definitions(-DAVIF_FOUND)
48-
find_package(AVIF REQUIRED)
41+
find_package(WebP REQUIRED)
4942
endif()
5043

5144
function(CUDA_CONVERT_FLAGS EXISTING_TARGET)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ versions.
2020

2121
| `torch` | `torchvision` | Python |
2222
| ------------------ | ------------------ | ------------------- |
23-
| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12` |
23+
| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.13` |
2424
| `2.8` | `0.23` | `>=3.9`, `<=3.13` |
2525
| `2.7` | `0.22` | `>=3.9`, `<=3.13` |
2626
| `2.6` | `0.21` | `>=3.9`, `<=3.12` |

packaging/pre_build_script.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
1717
# Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff
1818
# before re-installing them
1919
conda uninstall libjpeg-turbo libjpeg -y
20-
conda install -y ffmpeg=4.2 -c pytorch
2120
conda install -y libjpeg-turbo -c pytorch
2221

2322
# Copy binaries to be included in the wheel distribution
@@ -30,7 +29,7 @@ else
3029

3130
if [[ "$ARCH" == "aarch64" ]]; then
3231
conda install libpng -y
33-
conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly
32+
conda install -y libjpeg-turbo -c pytorch-nightly
3433
fi
3534

3635
conda install libwebp -y

setup.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import shutil
77
import subprocess
88
import sys
9+
import sysconfig
910
import warnings
1011
from pathlib import Path
1112

@@ -27,9 +28,9 @@
2728
# video decoding backends in torchvision. I'm renaming this to "gpu video
2829
# decoder" where possible, keeping user facing names (like the env var below) to
2930
# the old scheme for BC.
30-
USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1"
31+
USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "0") == "1"
3132
# Same here: "use ffmpeg" was used to denote "use cpu video decoder".
32-
USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1"
33+
USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "0") == "1"
3334

3435
TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "")
3536
TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "")
@@ -136,6 +137,8 @@ def get_macros_and_flags():
136137
if sys.platform == "win32":
137138
define_macros += [("torchvision_EXPORTS", None)]
138139
extra_compile_args["cxx"].append("/MP")
140+
if sysconfig.get_config_var("Py_GIL_DISABLED"):
141+
extra_compile_args["cxx"].append("-DPy_GIL_DISABLED")
139142

140143
if DEBUG:
141144
extra_compile_args["cxx"].append("-g")

test/test_transforms_v2.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3993,7 +3993,7 @@ class TestGaussianNoise:
39933993
"make_input",
39943994
[make_image_tensor, make_image, make_video],
39953995
)
3996-
def test_kernel(self, make_input):
3996+
def test_kernel_float(self, make_input):
39973997
check_kernel(
39983998
F.gaussian_noise,
39993999
make_input(dtype=torch.float32),
@@ -4005,9 +4005,28 @@ def test_kernel(self, make_input):
40054005
"make_input",
40064006
[make_image_tensor, make_image, make_video],
40074007
)
4008-
def test_functional(self, make_input):
4008+
def test_kernel_uint8(self, make_input):
4009+
check_kernel(
4010+
F.gaussian_noise,
4011+
make_input(dtype=torch.uint8),
4012+
# This cannot pass because the noise on a batch is not per-image
4013+
check_batched_vs_unbatched=False,
4014+
)
4015+
4016+
@pytest.mark.parametrize(
4017+
"make_input",
4018+
[make_image_tensor, make_image, make_video],
4019+
)
4020+
def test_functional_float(self, make_input):
40094021
check_functional(F.gaussian_noise, make_input(dtype=torch.float32))
40104022

4023+
@pytest.mark.parametrize(
4024+
"make_input",
4025+
[make_image_tensor, make_image, make_video],
4026+
)
4027+
def test_functional_uint8(self, make_input):
4028+
check_functional(F.gaussian_noise, make_input(dtype=torch.uint8))
4029+
40114030
@pytest.mark.parametrize(
40124031
("kernel", "input_type"),
40134032
[
@@ -4023,10 +4042,11 @@ def test_functional_signature(self, kernel, input_type):
40234042
"make_input",
40244043
[make_image_tensor, make_image, make_video],
40254044
)
4026-
def test_transform(self, make_input):
4045+
def test_transform_float(self, make_input):
40274046
def adapter(_, input, __):
4028-
# This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32
4029-
# Same for PIL images
4047+
# We have two different implementations for floats and uint8
4048+
# To test this implementation we'll convert the auto-generated uint8 tensors to float32
4049+
# We don't support other int dtypes nor PIL images
40304050
for key, value in input.items():
40314051
if isinstance(value, torch.Tensor) and not value.is_floating_point():
40324052
input[key] = value.to(torch.float32)
@@ -4036,11 +4056,29 @@ def adapter(_, input, __):
40364056

40374057
check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter)
40384058

4059+
@pytest.mark.parametrize(
    "make_input",
    [make_image_tensor, make_image, make_video],
)
def test_transform_uint8(self, make_input):
    # Runs the generic transform checks on GaussianNoise with a uint8 input,
    # using an adapter that rewrites the auto-generated sample inputs.
    def adapter(_, input, __):
        # We have two different implementations for floats and uint8.
        # To test this implementation we'll convert every tensor to uint8.
        # We don't support other int dtypes nor PIL images.
        for key, value in input.items():
            # The previous condition was `not value.dtype != torch.uint8`, a double
            # negative that only matched tensors that were *already* uint8, making
            # the conversion a no-op. Convert the tensors that are not uint8 instead.
            if isinstance(value, torch.Tensor) and value.dtype != torch.uint8:
                input[key] = value.to(torch.uint8)
            # PIL images are turned into uint8 tensors as well.
            if isinstance(value, PIL.Image.Image):
                input[key] = F.pil_to_tensor(value).to(torch.uint8)
        return input

    check_transform(transforms.GaussianNoise(), make_input(dtype=torch.uint8), check_sample_input=adapter)
4076+
40394077
def test_bad_input(self):
40404078
with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."):
40414079
F.gaussian_noise(make_image_pil())
4042-
with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"):
4043-
F.gaussian_noise(make_image(dtype=torch.uint8))
4080+
with pytest.raises(ValueError, match="Input tensor is expected to be in uint8 or float dtype"):
4081+
F.gaussian_noise(make_image(dtype=torch.int32))
40444082
with pytest.raises(ValueError, match="sigma shouldn't be negative"):
40454083
F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1)
40464084

torchvision/csrc/ops/cuda/nms_kernel.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ __global__ void nms_kernel_impl(
3434
double iou_threshold,
3535
const T* dev_boxes,
3636
unsigned long long* dev_mask) {
37-
const int row_start = blockIdx.y;
38-
const int col_start = blockIdx.x;
37+
const auto row_start = blockIdx.y;
38+
const auto col_start = blockIdx.x;
3939

4040
if (row_start > col_start)
4141
return;
@@ -59,7 +59,7 @@ __global__ void nms_kernel_impl(
5959
__syncthreads();
6060

6161
if (threadIdx.x < row_size) {
62-
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
62+
const auto cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
6363
const T* cur_box = dev_boxes + cur_box_idx * 4;
6464
int i = 0;
6565
unsigned long long t = 0;
@@ -84,7 +84,7 @@ __global__ static void gather_keep_from_mask(
8484
// Taken and adapted from mmcv
8585
// https://github.com/open-mmlab/mmcv/blob/03ce9208d18c0a63d7ffa087ea1c2f5661f2441a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh#L76
8686
const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
87-
const int thread_id = threadIdx.x;
87+
const auto thread_id = threadIdx.x;
8888

8989
// Mark the bboxes which have been removed.
9090
extern __shared__ unsigned long long removed[];

torchvision/datasets/caltech.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os
22
import os.path
3+
import shutil
34
from pathlib import Path
45
from typing import Any, Callable, Optional, Union
56

67
from PIL import Image
78

8-
from .utils import download_and_extract_archive, verify_str_arg
9+
from .utils import download_and_extract_archive, extract_archive, verify_str_arg
910
from .vision import VisionDataset
1011

1112

@@ -133,17 +134,17 @@ def download(self) -> None:
133134
return
134135

135136
download_and_extract_archive(
136-
"https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp",
137-
self.root,
138-
filename="101_ObjectCategories.tar.gz",
139-
md5="b224c7392d521a49829488ab0f1120d9",
140-
)
141-
download_and_extract_archive(
142-
"https://drive.google.com/file/d/175kQy3UsZ0wUEHZjqkUDdNVssr7bgh_m",
143-
self.root,
144-
filename="Annotations.tar",
145-
md5="6f83eeb1f24d99cab4eb377263132c91",
137+
"https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
138+
download_root=self.root,
139+
filename="caltech-101.zip",
140+
md5="3138e1922a9193bfa496528edbbc45d0",
146141
)
142+
gzip_folder = os.path.join(self.root, "caltech-101")
143+
for gzip_file in os.listdir(gzip_folder):
144+
if gzip_file.endswith(".gz"):
145+
extract_archive(os.path.join(gzip_folder, gzip_file), self.root)
146+
shutil.rmtree(gzip_folder)
147+
os.remove(os.path.join(self.root, "caltech-101.zip"))
147148

148149
def extra_repr(self) -> str:
149150
return "Target type: {target_type}".format(**self.__dict__)
@@ -233,7 +234,7 @@ def download(self) -> None:
233234
return
234235

235236
download_and_extract_archive(
236-
"https://drive.google.com/file/d/1r6o0pSROcV1_VwT4oSjA2FBUSCWGuxLK",
237+
"https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar",
237238
self.root,
238239
filename="256_ObjectCategories.tar",
239240
md5="67b4f42ca05d46448c6bb8ecd2220f6d",

torchvision/ops/drop_block.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ def drop_block2d(
3636

3737
N, C, H, W = input.size()
3838
block_size = min(block_size, W, H)
39+
if block_size % 2 == 0:
40+
raise ValueError(f"block size should be odd. Got {block_size} which is even.")
41+
3942
# compute the gamma of Bernoulli distribution
4043
gamma = (p * H * W) / ((block_size**2) * ((H - block_size + 1) * (W - block_size + 1)))
4144
noise = torch.empty((N, C, H - block_size + 1, W - block_size + 1), dtype=input.dtype, device=input.device)
@@ -82,6 +85,9 @@ def drop_block3d(
8285

8386
N, C, D, H, W = input.size()
8487
block_size = min(block_size, D, H, W)
88+
if block_size % 2 == 0:
89+
raise ValueError(f"block size should be odd. Got {block_size} which is even.")
90+
8591
# compute the gamma of Bernoulli distribution
8692
gamma = (p * D * H * W) / ((block_size**3) * ((D - block_size + 1) * (H - block_size + 1) * (W - block_size + 1)))
8793
noise = torch.empty(

torchvision/transforms/v2/_misc.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,13 +214,22 @@ class GaussianNoise(Transform):
214214
Each image or frame in a batch will be transformed independently i.e. the
215215
noise added to each image will be different.
216216
217-
The input tensor is also expected to be of float dtype in ``[0, 1]``.
218-
This transform does not support PIL images.
217+
The input tensor is also expected to be of float dtype in ``[0, 1]``,
218+
or of ``uint8`` dtype in ``[0, 255]``. This transform does not support PIL
219+
images.
220+
221+
Regardless of the dtype used, the parameters of the function use the same
222+
scale, so a ``mean`` parameter of 0.5 will result in an average value
223+
increase of 0.5 units for float images, and an average increase of 127.5
224+
units for ``uint8`` images.
219225
220226
Args:
221227
mean (float): Mean of the sampled normal distribution. Default is 0.
222228
sigma (float): Standard deviation of the sampled normal distribution. Default is 0.1.
223-
clip (bool, optional): Whether to clip the values in ``[0, 1]`` after adding noise. Default is True.
229+
clip (bool, optional): Whether to clip the values after adding noise, be it to
230+
``[0, 1]`` for floats or to ``[0, 255]`` for ``uint8``. Setting this parameter to
231+
``False`` may cause unsigned integer overflows with uint8 inputs.
232+
Default is True.
224233
"""
225234

226235
def __init__(self, mean: float = 0.0, sigma: float = 0.1, clip=True) -> None:

0 commit comments

Comments
 (0)