diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c40096bf30b..4b2a61db51e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -40,34 +40,34 @@ jobs: exit 1 fi - c-source: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.7 - permissions: - id-token: write - contents: read - with: - repository: pytorch/vision - test-infra-ref: release/2.7 - script: | - set -euo pipefail - - echo '::group::Setup environment' - CONDA_PATH=$(which conda) - eval "$(${CONDA_PATH} shell.bash hook)" - conda create --name ci --quiet --yes -c conda-forge python=3.9 clang-format - conda activate ci - echo '::endgroup::' - - - echo '::group::Lint C source' - set +e - ./.github/scripts/run-clang-format.py -r torchvision/csrc --exclude "torchvision/csrc/io/image/cpu/giflib/*" - - if [ $? -ne 0 ]; then - git --no-pager diff - exit 1 - fi - echo '::endgroup::' + # c-source: + # uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + # permissions: + # id-token: write + # contents: read + # with: + # repository: pytorch/vision + # test-infra-ref: main + # script: | + # set -euo pipefail + + # echo '::group::Setup environment' + # CONDA_PATH=$(which conda) + # eval "$(${CONDA_PATH} shell.bash hook)" + # conda create --name ci --quiet --yes -c conda-forge python=3.9 clang-format + # conda activate ci + # echo '::endgroup::' + + + # echo '::group::Lint C source' + # set +e + # ./.github/scripts/run-clang-format.py -r torchvision/csrc --exclude "torchvision/csrc/io/image/cpu/giflib/*" + + # if [ $? -ne 0 ]; then + # git --no-pager diff + # exit 1 + # fi + # echo '::endgroup::' python-types: @@ -99,13 +99,14 @@ jobs: mypy --install-types --non-interactive --config-file mypy.ini echo '::endgroup::' - bc: - if: github.event.pull_request - runs-on: ubuntu-latest - steps: - - name: Run BC Lint Action - uses: pytorch/test-infra/.github/actions/bc-lint@release/2.7 - with: - repo: ${{ github.event.pull_request.head.repo.full_name }} - base_sha: ${{ github.event.pull_request.base.sha }} - head_sha: ${{ github.event.pull_request.head.sha }} + # bc: + # if: github.event.pull_request + # runs-on: ubuntu-latest + # steps: + # - name: Run BC Lint Action + # uses: pytorch/test-infra/.github/actions/bc-lint@main + # with: + # repo: ${{ github.event.pull_request.head.repo.full_name }} + # base_sha: ${{ github.event.pull_request.base.sha }} + # head_sha: ${{ github.event.pull_request.head.sha }} +# >>>>>>> 8a06122218 (Deactivate bc linter (#8999)) diff --git a/docs/source/io.rst b/docs/source/io.rst index c3f2d658014..1bfd10d9600 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -86,17 +86,16 @@ IO operations read_file write_file -Video ------ +Video - DEPRECATED +------------------ .. warning:: - Torchvision supports video decoding through different APIs listed below, - some of which are still in BETA stage. In the near future, we intend to - centralize PyTorch's video decoding capabilities within the `torchcodec - <https://github.com/pytorch/torchcodec>`_ project. We encourage you to try - it out and share your feedback, as the torchvision video decoders will - eventually be deprecated. + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch ..
autosummary:: :toctree: generated/ diff --git a/gallery/others/plot_video_api.py b/gallery/others/plot_video_api.py deleted file mode 100644 index 3a67e4d86d0..00000000000 --- a/gallery/others/plot_video_api.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -========= -Video API -========= - -.. note:: - Try on `Colab `_ - or :ref:`go to the end ` to download the full example code. - -This example illustrates some of the APIs that torchvision offers for -videos, together with the examples on how to build datasets and more. -""" - -# %% -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -# %% -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url -torchvision.set_video_backend("video_reader") - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -# %% -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. -# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -# %% -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -# %% -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -# %% -# But what if we only want to read certain time segment of the video? 
-# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. -# -# For example, if we wanted to read ten frames from second second: - - -import itertools -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. - -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -# %% -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -# %% -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. - - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -# %% -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
- -# %% -# Make sample dataset -import os -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -# %% -# Download the videos -from torchvision.datasets.utils import download_url -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" -) - -# %% -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - -# %% -# We are going to define the dataset and some basic arguments. -# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. - - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} - yield output - -# %% -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... 
-# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -# %% -from torch.utils.data import DataLoader -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) -print(data) - -# %% -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -# %% -# Cleanup the video and dataset: -import os -import shutil -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") diff --git a/test/test_datasets.py b/test/test_datasets.py index 1413d2c312d..feaabd7acd2 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -11,6 +11,7 @@ import re import shutil import string +import sys import unittest import xml.etree.ElementTree as ET import zipfile @@ -1146,6 +1147,7 @@ class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Omniglot ADDITIONAL_CONFIGS = combinations_grid(background=(True, False)) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): target_folder = ( @@ -1902,6 +1904,7 @@ def test_class_to_idx(self): assert dataset.class_to_idx == class_to_idx +@pytest.mark.skipif(sys.platform in ("win32", "cygwin"), reason="temporarily disabled on Windows") class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.INaturalist FEATURE_TYPES = (PIL.Image.Image, (int, tuple)) @@ -1910,6 +1913,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]), version=("2021_train",), ) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): categories = [ @@ -2038,6 +2042,8 @@ class SintelTestCase(datasets_utils.ImageDatasetTestCase): FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "Sintel" @@ -2104,6 +2110,8 @@ class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "KittiFlow" @@ -2223,6 +2231,8 @@ class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase): FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "FlyingThings3D" @@ -2289,6 +2299,8 @@ def test_bad_input(self): class HD1KTestCase(KittiFlowTestCase): DATASET_CLASS = datasets.HD1K + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "hd1k" diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index ddd7ebf4e6f..8fa08fa6755 100644 --- a/test/test_transforms_v2.py +++ 
b/test/test_transforms_v2.py @@ -3013,12 +3013,18 @@ def test_errors(self): with pytest.raises(ValueError, match="Please provide only two dimensions"): transforms.RandomCrop([10, 12, 14]) - with pytest.raises(TypeError, match="Got inappropriate padding arg"): + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): transforms.RandomCrop([10, 12], padding="abc") with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[0.5, 0.5]) + with pytest.raises(TypeError, match="Got inappropriate fill arg"): transforms.RandomCrop([10, 12], padding=1, fill="abc") @@ -3505,6 +3511,14 @@ def test_aug_mix_severity_error(self, severity): with pytest.raises(ValueError, match="severity must be between"): transforms.AugMix(severity=severity) + @pytest.mark.parametrize("num_ops", [-1, 1.1]) + def test_rand_augment_num_ops_error(self, num_ops): + with pytest.raises( + ValueError, + match=re.escape(f"num_ops should be a non-negative integer, but got {num_ops} instead."), + ): + transforms.RandAugment(num_ops=num_ops) + class TestConvertBoundingBoxFormat: old_new_formats = list(itertools.permutations(iter(tv_tensors.BoundingBoxFormat), 2)) @@ -3870,12 +3884,18 @@ def test_transform(self, make_input): check_transform(transforms.Pad(padding=[1]), make_input()) def test_transform_errors(self): - with pytest.raises(TypeError, match="Got inappropriate padding arg"): + with pytest.raises(ValueError, match="Padding must be"): transforms.Pad("abc") - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): transforms.Pad([-0.7, 0, 0.7]) + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(padding=[0.5, 0.5]) + with pytest.raises(TypeError, match="Got inappropriate fill arg"): transforms.Pad(12, fill="abc") @@ -4594,6 +4614,14 @@ def test_correctness_image(self, bits, fn): assert_equal(actual, expected) + @pytest.mark.parametrize("bits", [-1, 9, 2.1]) + def test_error_functional(self, bits): + with pytest.raises( + TypeError, + match=re.escape(f"bits must be a positive integer in the range [0, 8], got {bits} instead."), + ): + F.posterize(make_image(dtype=torch.uint8), bits=bits) + class TestSolarize: def _make_threshold(self, input, *, factor=0.5): @@ -6176,6 +6204,11 @@ def test_transform_invalid_quality_error(self, quality): with pytest.raises(ValueError, match="quality must be an integer from 1 to 100"): transforms.JPEG(quality=quality) + @pytest.mark.parametrize("quality", [None, True]) + def test_transform_quality_type_error(self, quality): + with pytest.raises(TypeError, match="quality"): + transforms.JPEG(quality=quality) + class TestUtils: # TODO: Still need to test has_all, has_any, check_type and get_bouding_boxes diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index e8d6247f03f..9ee4c4df52f 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ 
-3,13 +3,14 @@ from abc import ABC, abstractmethod from glob import glob from pathlib import Path -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import numpy as np import torch from PIL import Image from ..io.image import decode_png, read_file +from .folder import default_loader from .utils import _read_pfm, verify_str_arg from .vision import VisionDataset @@ -32,19 +33,22 @@ class FlowDataset(ABC, VisionDataset): # and it's up to whatever consumes the dataset to decide what valid_flow_mask should be. _has_builtin_flow_mask = False - def __init__(self, root: Union[str, Path], transforms: Optional[Callable] = None) -> None: + def __init__( + self, + root: Union[str, Path], + transforms: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + ) -> None: super().__init__(root=root) self.transforms = transforms self._flow_list: List[str] = [] self._image_list: List[List[str]] = [] + self._loader = loader - def _read_img(self, file_name: str) -> Image.Image: - img = Image.open(file_name) - if img.mode != "RGB": - img = img.convert("RGB") # type: ignore[assignment] - return img + def _read_img(self, file_name: str) -> Union[Image.Image, torch.Tensor]: + return self._loader(file_name) @abstractmethod def _read_flow(self, file_name: str): @@ -70,9 +74,9 @@ def __getitem__(self, index: int) -> Union[T1, T2]: if self._has_builtin_flow_mask or valid_flow_mask is not None: # The `or valid_flow_mask is not None` part is here because the mask can be generated within a transform - return img1, img2, flow, valid_flow_mask + return img1, img2, flow, valid_flow_mask # type: ignore[return-value] else: - return img1, img2, flow + return img1, img2, flow # type: ignore[return-value] def __len__(self) -> int: return len(self._image_list) @@ -120,6 +124,9 @@ class Sintel(FlowDataset): ``img1, img2, flow, valid_flow_mask`` and returns a transformed version. ``valid_flow_mask`` is expected for consistency with other datasets which return a built-in valid mask, such as :class:`~torchvision.datasets.KittiFlow`. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. """ def __init__( @@ -128,8 +135,9 @@ def __init__( split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, ) -> None: - super().__init__(root=root, transforms=transforms) + super().__init__(root=root, transforms=transforms, loader=loader) verify_str_arg(split, "split", valid_values=("train", "test")) verify_str_arg(pass_name, "pass_name", valid_values=("clean", "final", "both")) @@ -186,12 +194,21 @@ class KittiFlow(FlowDataset): split (string, optional): The dataset split, either "train" (default) or "test" transforms (callable, optional): A function/transform that takes in ``img1, img2, flow, valid_flow_mask`` and returns a transformed version. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. 
""" _has_builtin_flow_mask = True - def __init__(self, root: Union[str, Path], split: str = "train", transforms: Optional[Callable] = None) -> None: - super().__init__(root=root, transforms=transforms) + def __init__( + self, + root: Union[str, Path], + split: str = "train", + transforms: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + ) -> None: + super().__init__(root=root, transforms=transforms, loader=loader) verify_str_arg(split, "split", valid_values=("train", "test")) @@ -324,6 +341,9 @@ class FlyingThings3D(FlowDataset): ``img1, img2, flow, valid_flow_mask`` and returns a transformed version. ``valid_flow_mask`` is expected for consistency with other datasets which return a built-in valid mask, such as :class:`~torchvision.datasets.KittiFlow`. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. """ def __init__( @@ -333,8 +353,9 @@ def __init__( pass_name: str = "clean", camera: str = "left", transforms: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, ) -> None: - super().__init__(root=root, transforms=transforms) + super().__init__(root=root, transforms=transforms, loader=loader) verify_str_arg(split, "split", valid_values=("train", "test")) split = split.upper() @@ -414,12 +435,21 @@ class HD1K(FlowDataset): split (string, optional): The dataset split, either "train" (default) or "test" transforms (callable, optional): A function/transform that takes in ``img1, img2, flow, valid_flow_mask`` and returns a transformed version. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. """ _has_builtin_flow_mask = True - def __init__(self, root: Union[str, Path], split: str = "train", transforms: Optional[Callable] = None) -> None: - super().__init__(root=root, transforms=transforms) + def __init__( + self, + root: Union[str, Path], + split: str = "train", + transforms: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + ) -> None: + super().__init__(root=root, transforms=transforms, loader=loader) verify_str_arg(split, "split", valid_values=("train", "test")) diff --git a/torchvision/datasets/inaturalist.py b/torchvision/datasets/inaturalist.py index e041d41f4a2..8713bc041db 100644 --- a/torchvision/datasets/inaturalist.py +++ b/torchvision/datasets/inaturalist.py @@ -62,6 +62,9 @@ class INaturalist(VisionDataset): download (bool, optional): If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. 
""" def __init__( @@ -72,6 +75,7 @@ def __init__( transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, + loader: Optional[Callable[[Union[str, Path]], Any]] = None, ) -> None: self.version = verify_str_arg(version, "version", DATASET_URLS.keys()) @@ -109,6 +113,8 @@ def __init__( for fname in files: self.index.append((dir_index, fname)) + self.loader = loader or Image.open + def _init_2021(self) -> None: """Initialize based on 2021 layout""" @@ -178,7 +184,7 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: """ cat_id, fname = self.index[index] - img = Image.open(os.path.join(self.root, self.all_categories[cat_id], fname)) + img = self.loader(os.path.join(self.root, self.all_categories[cat_id], fname)) target: Any = [] for t in self.target_type: diff --git a/torchvision/datasets/omniglot.py b/torchvision/datasets/omniglot.py index c3434a72456..f8d182cdb25 100644 --- a/torchvision/datasets/omniglot.py +++ b/torchvision/datasets/omniglot.py @@ -23,6 +23,9 @@ class Omniglot(VisionDataset): download (bool, optional): If true, downloads the dataset zip files from the internet and puts it in root directory. If the zip files are already downloaded, they are not downloaded again. + loader (callable, optional): A function to load an image given its path. + By default, it uses PIL as its image loader, but users could also pass in + ``torchvision.io.decode_image`` for decoding image data into tensors directly. """ folder = "omniglot-py" @@ -39,6 +42,7 @@ def __init__( transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, + loader: Optional[Callable[[Union[str, Path]], Any]] = None, ) -> None: super().__init__(join(root, self.folder), transform=transform, target_transform=target_transform) self.background = background @@ -59,6 +63,7 @@ def __init__( for idx, character in enumerate(self._characters) ] self._flat_character_images: List[Tuple[str, int]] = sum(self._character_images, []) + self.loader = loader def __len__(self) -> int: return len(self._flat_character_images) @@ -73,7 +78,7 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: """ image_name, character_class = self._flat_character_images[index] image_path = join(self.target_folder, self._characters[character_class], image_name) - image = Image.open(image_path, mode="r").convert("L") + image = Image.open(image_path, mode="r").convert("L") if self.loader is None else self.loader(image_path) if self.transform: image = self.transform(image) diff --git a/torchvision/io/_video_deprecation_warning.py b/torchvision/io/_video_deprecation_warning.py new file mode 100644 index 00000000000..ea01d976110 --- /dev/null +++ b/torchvision/io/_video_deprecation_warning.py @@ -0,0 +1,12 @@ +import warnings + + +def _raise_video_deprecation_warning(): + warnings.warn( + "The video decoding and encoding capabilities of torchvision " + "are deprecated from version 0.22 and will be removed in version 0.24. 
" + "We recommend that you migrate to TorchCodec, where we'll consolidate " + "the future decoding/encoding capabilities of PyTorch: " + "https://github.com/pytorch/torchcodec", + UserWarning, + ) diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py index 69af045e773..7957ce3899a 100644 --- a/torchvision/io/_video_opt.py +++ b/torchvision/io/_video_opt.py @@ -6,6 +6,7 @@ import torch from ..extension import _load_library +from ._video_deprecation_warning import _raise_video_deprecation_warning try: @@ -185,6 +186,7 @@ def _read_video_from_file( info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int) """ + _raise_video_deprecation_warning() _validate_pts(video_pts_range) _validate_pts(audio_pts_range) @@ -256,6 +258,7 @@ def _probe_video_from_file(filename: str) -> VideoMetaData: """ Probe a video file and return VideoMetaData with info about the video """ + _raise_video_deprecation_warning() result = torch.ops.video_reader.probe_video_from_file(filename) vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration) @@ -331,6 +334,7 @@ def _read_video_from_memory( `K` is the number of channels """ + _raise_video_deprecation_warning() _validate_pts(video_pts_range) _validate_pts(audio_pts_range) @@ -405,6 +409,7 @@ def _read_video_timestamps_from_memory( 0, # audio_timebase_num 1, # audio_timebase_den ) + _raise_video_deprecation_warning() _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration) @@ -420,6 +425,7 @@ def _probe_video_from_memory( Probe a video in memory and return VideoMetaData with info about the video This function is torchscriptable """ + _raise_video_deprecation_warning() if not isinstance(video_data, torch.Tensor): with warnings.catch_warnings(): # Ignore the warning because we actually don't modify the buffer in this function @@ -437,6 +443,7 @@ def _read_video( end_pts: Optional[Union[float, Fraction]] = None, pts_unit: str = "pts", ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, float]]: + _raise_video_deprecation_warning() if end_pts is None: end_pts = float("inf") @@ -495,6 +502,7 @@ def get_pts(time_base): def _read_video_timestamps( filename: str, pts_unit: str = "pts" ) -> Tuple[Union[List[int], List[Fraction]], Optional[float]]: + _raise_video_deprecation_warning() if pts_unit == "pts": warnings.warn( "The pts_unit 'pts' gives wrong results and will be removed in a " diff --git a/torchvision/io/video.py b/torchvision/io/video.py index 2e3dbed65a2..be01831ceb5 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -11,6 +11,7 @@ from ..utils import _log_api_usage_once from . import _video_opt +from ._video_deprecation_warning import _raise_video_deprecation_warning try: import av @@ -66,21 +67,21 @@ def write_video( audio_options: Optional[Dict[str, Any]] = None, ) -> None: """ - Writes a 4d tensor in [T, H, W, C] format in a video file. + [DEPRECATED] Writes a 4d tensor in [T, H, W, C] format in a video file. + + .. warning:: + + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. 
We + recommend that you migrate to + `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch This function relies on PyAV (therefore, ultimately FFmpeg) to encode videos, you can get more fine-grained control by referring to the other options at your disposal within `the FFMpeg wiki `_. - .. warning:: - - In the near future, we intend to centralize PyTorch's video decoding - capabilities within the `torchcodec - <https://github.com/pytorch/torchcodec>`_ project. We encourage you to - try it out and share your feedback, as the torchvision video decoders - will eventually be deprecated. - Args: filename (str): path where the video will be saved video_array (Tensor[T, H, W, C]): tensor containing the individual frames, @@ -107,6 +108,7 @@ def write_video( >>> write_video("video.mp4", options = {"crf": "17"}) """ + _raise_video_deprecation_warning() if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(write_video) _check_av_available() @@ -276,16 +278,15 @@ def read_video( pts_unit: str = "pts", output_format: str = "THWC", ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: - """ - Reads a video from a file, returning both the video frames and the audio frames + """[DEPRECATED] Reads a video from a file, returning both the video frames and the audio frames .. warning:: - In the near future, we intend to centralize PyTorch's video decoding - capabilities within the `torchcodec - <https://github.com/pytorch/torchcodec>`_ project. We encourage you to - try it out and share your feedback, as the torchvision video decoders - will eventually be deprecated. + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch Args: filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts. @@ -302,6 +303,7 @@ def read_video( aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int) """ + _raise_video_deprecation_warning() if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(read_video) @@ -408,16 +410,15 @@ def _decode_video_timestamps(container: "av.container.Container") -> List[int]: def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[int], Optional[float]]: - """ - List the video frames timestamps. + """[DEPRECATED] List the video frames timestamps. .. warning:: - In the near future, we intend to centralize PyTorch's video decoding - capabilities within the `torchcodec - <https://github.com/pytorch/torchcodec>`_ project. We encourage you to - try it out and share your feedback, as the torchvision video decoders - will eventually be deprecated. + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch Note that the function decodes the whole video frame-by-frame.
@@ -432,6 +433,7 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[in video_fps (float, optional): the frame rate for the video """ + _raise_video_deprecation_warning() if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(read_video_timestamps) from torchvision import get_video_backend diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py index cf319fe288e..5096b6ba324 100644 --- a/torchvision/io/video_reader.py +++ b/torchvision/io/video_reader.py @@ -6,6 +6,7 @@ import torch from ..utils import _log_api_usage_once +from ._video_deprecation_warning import _raise_video_deprecation_warning from ._video_opt import _HAS_CPU_VIDEO_DECODER @@ -45,8 +46,7 @@ def _has_video_opt() -> bool: class VideoReader: - """ - Fine-grained video-reading API. + """[DEPRECATED] Fine-grained video-reading API. Supports frame-by-frame reading of various streams from a single video container. Much like previous video_reader API it supports the following backends: video_reader, pyav, and cuda. @@ -54,11 +54,11 @@ class VideoReader: .. warning:: - In the near future, we intend to centralize PyTorch's video decoding - capabilities within the `torchcodec - `_ project. We encourage you to - try it out and share your feedback, as the torchvision video decoders - will eventually be deprecated. + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec `__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch .. betastatus:: VideoReader class @@ -125,6 +125,7 @@ def __init__( stream: str = "video", num_threads: int = 0, ) -> None: + _raise_video_deprecation_warning() _log_api_usage_once(self) from .. import get_video_backend diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 93d4ba45d65..2aad7bd4dc3 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -352,6 +352,8 @@ class JPEG(Transform): def __init__(self, quality: Union[int, Sequence[int]]): super().__init__() if isinstance(quality, int): + if isinstance(quality, bool): + raise TypeError("quality can't be bool") quality = [quality, quality] else: _check_sequence_input(quality, "quality", req_sizes=(2,)) diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 4dd7ba343aa..240330386fb 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -361,7 +361,8 @@ class RandAugment(_AutoAugmentBase): If img is PIL Image, it is expected to be in mode "L" or "RGB". Args: - num_ops (int, optional): Number of augmentation transformations to apply sequentially. + num_ops (int, optional): Number of augmentation transformations to apply sequentially, + must be non-negative integer. Default: 2. magnitude (int, optional): Magnitude for all the transformations. num_magnitude_bins (int, optional): The number of different magnitude values. 
interpolation (InterpolationMode, optional): Desired interpolation enum defined by @@ -407,6 +408,8 @@ def __init__( fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = None, ) -> None: super().__init__(interpolation=interpolation, fill=fill) + if not isinstance(num_ops, int) or (num_ops < 0): + raise ValueError(f"num_ops should be a non-negative integer, but got {num_ops} instead.") self.num_ops = num_ops self.magnitude = magnitude self.num_magnitude_bins = num_magnitude_bins diff --git a/torchvision/transforms/v2/_utils.py b/torchvision/transforms/v2/_utils.py index dd65ca4d9c9..92d5bc1a2ca 100644 --- a/torchvision/transforms/v2/_utils.py +++ b/torchvision/transforms/v2/_utils.py @@ -81,11 +81,13 @@ def _get_fill(fill_dict, inpt_type): def _check_padding_arg(padding: Union[int, Sequence[int]]) -> None: - if not isinstance(padding, (numbers.Number, tuple, list)): - raise TypeError("Got inappropriate padding arg") - if isinstance(padding, (tuple, list)) and len(padding) not in [1, 2, 4]: - raise ValueError(f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple") + err_msg = f"Padding must be an int or a 1, 2, or 4 element of tuple or list, got {padding}." + if isinstance(padding, (tuple, list)): + if len(padding) not in [1, 2, 4] or not all(isinstance(p, int) for p in padding): + raise ValueError(err_msg) + elif not isinstance(padding, int): + raise ValueError(err_msg) # TODO: let's use torchvision._utils.StrEnum to have the best of both worlds (strings and enums) diff --git a/torchvision/transforms/v2/functional/_color.py b/torchvision/transforms/v2/functional/_color.py index eb75f58cb7a..a3f187f84cf 100644 --- a/torchvision/transforms/v2/functional/_color.py +++ b/torchvision/transforms/v2/functional/_color.py @@ -460,6 +460,9 @@ def posterize(inpt: torch.Tensor, bits: int) -> torch.Tensor: @_register_kernel_internal(posterize, torch.Tensor) @_register_kernel_internal(posterize, tv_tensors.Image) def posterize_image(image: torch.Tensor, bits: int) -> torch.Tensor: + if not isinstance(bits, int) or not 0 <= bits <= 8: + raise TypeError(f"bits must be a positive integer in the range [0, 8], got {bits} instead.") + if image.is_floating_point(): levels = 1 << bits return image.mul(levels).floor_().clamp_(0, levels - 1).mul_(1.0 / levels)
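Usage sketch (not part of the patch above): the new ``loader`` argument added to the optical-flow and classification datasets can be passed ``torchvision.io.decode_image`` to decode frames straight into uint8 tensors instead of PIL images. The dataset root below is a placeholder path.

from torchvision.datasets import Sintel
from torchvision.io import decode_image

# decode_image takes a file path and returns a uint8 tensor, so the dataset
# yields tensors directly instead of PIL images.
dataset = Sintel(root="datasets/Sintel", split="train", pass_name="clean", loader=decode_image)
img1, img2, flow = dataset[0]  # img1/img2 are uint8 tensors; flow is a numpy array for the "train" split

The same keyword applies to KittiFlow, FlyingThings3D, HD1K, INaturalist and Omniglot as modified in this patch; any ``transforms`` used alongside it must accept tensor inputs.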
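Similarly, a sketch (under the assumption that a readable video file and a working video backend are available; ``video.mp4`` is a placeholder) of how the warning added in ``torchvision/io/_video_deprecation_warning.py`` surfaces to callers of the deprecated video APIs.

import warnings
from torchvision.io import read_video

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    frames, audio, info = read_video("video.mp4", pts_unit="sec")

# The UserWarning emitted by _raise_video_deprecation_warning() is recorded above.
assert any("deprecated" in str(w.message) for w in caught)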