diff --git a/docs/source/io.rst b/docs/source/io.rst
index c3f2d658014..1bfd10d9600 100644
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -86,17 +86,16 @@ IO operations
     read_file
     write_file
 
-Video
------
+Video - DEPRECATED
+------------------
 
 .. warning::
 
-    Torchvision supports video decoding through different APIs listed below,
-    some of which are still in BETA stage. In the near future, we intend to
-    centralize PyTorch's video decoding capabilities within the `torchcodec
-    <https://github.com/pytorch/torchcodec>`_ project. We encourage you to try
-    it out and share your feedback, as the torchvision video decoders will
-    eventually be deprecated.
+    DEPRECATED: All the video decoding and encoding capabilities of torchvision
+    are deprecated from version 0.22 and will be removed in version 0.24. We
+    recommend that you migrate to
+    `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+    consolidate the future decoding/encoding capabilities of PyTorch.
 
 .. autosummary::
     :toctree: generated/
diff --git a/gallery/others/plot_video_api.py b/gallery/others/plot_video_api.py
deleted file mode 100644
index 3a67e4d86d0..00000000000
--- a/gallery/others/plot_video_api.py
+++ /dev/null
@@ -1,346 +0,0 @@
-"""
-=========
-Video API
-=========
-
-.. note::
-    Try on `Colab `_
-    or :ref:`go to the end ` to download the full example code.
-
-This example illustrates some of the APIs that torchvision offers for
-videos, together with the examples on how to build datasets and more.
-"""
-
-# %%
-# 1. Introduction: building a new video object and examining the properties
-# -------------------------------------------------------------------------
-# First we select a video to test the object out. For the sake of argument
-# we're using one from kinetics400 dataset.
-# To create it, we need to define the path and the stream we want to use.
-
-# %%
-# Chosen video statistics:
-#
-# - WUzgd7C1pWA.mp4
-#     - source:
-#         - kinetics-400
-#     - video:
-#         - H-264
-#         - MPEG-4 AVC (part 10) (avc1)
-#         - fps: 29.97
-#     - audio:
-#         - MPEG AAC audio (mp4a)
-#         - sample rate: 48K Hz
-#
-
-import torch
-import torchvision
-from torchvision.datasets.utils import download_url
-torchvision.set_video_backend("video_reader")
-
-# Download the sample video
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
-    ".",
-    "WUzgd7C1pWA.mp4"
-)
-video_path = "./WUzgd7C1pWA.mp4"
-
-# %%
-# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form
-# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int.
-# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
-# Firstly, let's get the metadata for our particular video:
-
-stream = "video"
-video = torchvision.io.VideoReader(video_path, stream)
-video.get_metadata()
-
-# %%
-# Here we can see that video has two streams - a video and an audio stream.
-# Currently available stream types include ['video', 'audio'].
-# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id
-# (which are determined by video encoding).
-# In this way, if the video container contains multiple streams of the same type,
-# users can access the one they want.
-# If only stream type is passed, the decoder auto-detects first stream of that type and returns it.
-
-# %%
-# Let's read all the frames from the video stream. By default, the return value of
-# ``next(video_reader)`` is a dict containing the following fields.
-#
-# The return fields are:
-#
-# - ``data``: containing a torch.tensor
-# - ``pts``: containing a float timestamp of this particular frame
-
-metadata = video.get_metadata()
-video.set_current_stream("audio")
-
-frames = []  # we are going to save the frames here.
-ptss = []  # pts is a presentation timestamp in seconds (float) of each frame
-for frame in video:
-    frames.append(frame['data'])
-    ptss.append(frame['pts'])
-
-print("PTS for first five frames ", ptss[:5])
-print("Total number of frames: ", len(frames))
-approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
-print("Approx total number of datapoints we can expect: ", approx_nf)
-print("Read data size: ", frames[0].size(0) * len(frames))
-
-# %%
-# But what if we only want to read certain time segment of the video?
-# That can be done easily using the combination of our ``seek`` function, and the fact that each call
-# to next returns the presentation timestamp of the returned frame in seconds.
-#
-# Given that our implementation relies on python iterators,
-# we can leverage itertools to simplify the process and make it more pythonic.
-#
-# For example, if we wanted to read ten frames from second second:
-
-
-import itertools
-video.set_current_stream("video")
-
-frames = []  # we are going to save the frames here.
-
-# We seek into a second second of the video and use islice to get 10 frames since
-for frame, pts in itertools.islice(video.seek(2), 10):
-    frames.append(frame)
-
-print("Total number of frames: ", len(frames))
-
-# %%
-# Or if we wanted to read from 2nd to 5th second,
-# We seek into a second second of the video,
-# then we utilize the itertools takewhile to get the
-# correct number of frames:
-
-video.set_current_stream("video")
-frames = []  # we are going to save the frames here.
-video = video.seek(2)
-
-for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video):
-    frames.append(frame['data'])
-
-print("Total number of frames: ", len(frames))
-approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
-print("We can expect approx: ", approx_nf)
-print("Tensor size: ", frames[0].size())
-
-# %%
-# 2. Building a sample read_video function
-# ----------------------------------------------------------------------------------------
-# We can utilize the methods above to build the read video function that follows
-# the same API to the existing ``read_video`` function.
-
-
-def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True):
-    if end is None:
-        end = float("inf")
-    if end < start:
-        raise ValueError(
-            "end time should be larger than start time, got "
-            f"start time={start} and end time={end}"
-        )
-
-    video_frames = torch.empty(0)
-    video_pts = []
-    if read_video:
-        video_object.set_current_stream("video")
-        frames = []
-        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
-            frames.append(frame['data'])
-            video_pts.append(frame['pts'])
-        if len(frames) > 0:
-            video_frames = torch.stack(frames, 0)
-
-    audio_frames = torch.empty(0)
-    audio_pts = []
-    if read_audio:
-        video_object.set_current_stream("audio")
-        frames = []
-        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
-            frames.append(frame['data'])
-            audio_pts.append(frame['pts'])
-        if len(frames) > 0:
-            audio_frames = torch.cat(frames, 0)
-
-    return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata()
-
-
-# Total number of frames should be 327 for video and 523264 datapoints for audio
-vf, af, info, meta = example_read_video(video)
-print(vf.size(), af.size())
-
-# %%
-# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
-# -------------------------------------------------------------------------------------------------------
-# Cool, so now we can use the same principle to make the sample dataset.
-# We suggest trying out iterable dataset for this purpose.
-# Here, we are going to build an example dataset that reads randomly selected 10 frames of video.
-
-# %%
-# Make sample dataset
-import os
-os.makedirs("./dataset", exist_ok=True)
-os.makedirs("./dataset/1", exist_ok=True)
-os.makedirs("./dataset/2", exist_ok=True)
-
-# %%
-# Download the videos
-from torchvision.datasets.utils import download_url
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
-    "./dataset/1", "WUzgd7C1pWA.mp4"
-)
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true",
-    "./dataset/1",
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi"
-)
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true",
-    "./dataset/2",
-    "SOX5yA1l24A.mp4"
-)
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true",
-    "./dataset/2",
-    "v_SoccerJuggling_g23_c01.avi"
-)
-download_url(
-    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true",
-    "./dataset/2",
-    "v_SoccerJuggling_g24_c01.avi"
-)
-
-# %%
-# Housekeeping and utilities
-import os
-import random
-
-from torchvision.datasets.folder import make_dataset
-from torchvision import transforms as t
-
-
-def _find_classes(dir):
-    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
-    classes.sort()
-    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
-    return classes, class_to_idx
-
-
-def get_samples(root, extensions=(".mp4", ".avi")):
-    _, class_to_idx = _find_classes(root)
-    return make_dataset(root, class_to_idx, extensions=extensions)
-
-# %%
-# We are going to define the dataset and some basic arguments.
-# We assume the structure of the FolderDataset, and add the following parameters:
-#
-# - ``clip_len``: length of a clip in frames
-# - ``frame_transform``: transform for every frame individually
-# - ``video_transform``: transform on a video sequence
-#
-# .. note::
-#     We actually add epoch size as using :func:`~torch.utils.data.IterableDataset`
-#     class allows us to naturally oversample clips or images from each video if needed.
-
-
-class RandomDataset(torch.utils.data.IterableDataset):
-    def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
-        super(RandomDataset).__init__()
-
-        self.samples = get_samples(root)
-
-        # Allow for temporal jittering
-        if epoch_size is None:
-            epoch_size = len(self.samples)
-        self.epoch_size = epoch_size
-
-        self.clip_len = clip_len
-        self.frame_transform = frame_transform
-        self.video_transform = video_transform
-
-    def __iter__(self):
-        for i in range(self.epoch_size):
-            # Get random sample
-            path, target = random.choice(self.samples)
-            # Get video object
-            vid = torchvision.io.VideoReader(path, "video")
-            metadata = vid.get_metadata()
-            video_frames = []  # video frame buffer
-
-            # Seek and return frames
-            max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0])
-            start = random.uniform(0., max_seek)
-            for frame in itertools.islice(vid.seek(start), self.clip_len):
-                video_frames.append(self.frame_transform(frame['data']))
-                current_pts = frame['pts']
-            # Stack it into a tensor
-            video = torch.stack(video_frames, 0)
-            if self.video_transform:
-                video = self.video_transform(video)
-            output = {
-                'path': path,
-                'video': video,
-                'target': target,
-                'start': start,
-                'end': current_pts}
-            yield output
-
-# %%
-# Given a path of videos in a folder structure, i.e:
-#
-# - dataset
-#     - class 1
-#         - file 0
-#         - file 1
-#         - ...
-#     - class 2
-#         - file 0
-#         - file 1
-#         - ...
-#     - ...
-#
-# We can generate a dataloader and test the dataset.
-
-
-transforms = [t.Resize((112, 112))]
-frame_transform = t.Compose(transforms)
-
-dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
-
-# %%
-from torch.utils.data import DataLoader
-loader = DataLoader(dataset, batch_size=12)
-data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
-for batch in loader:
-    for i in range(len(batch['path'])):
-        data['video'].append(batch['path'][i])
-        data['start'].append(batch['start'][i].item())
-        data['end'].append(batch['end'][i].item())
-        data['tensorsize'].append(batch['video'][i].size())
-print(data)
-
-# %%
-# 4. Data Visualization
-# ----------------------------------
-# Example of visualized video
-
-import matplotlib.pyplot as plt
-
-plt.figure(figsize=(12, 12))
-for i in range(16):
-    plt.subplot(4, 4, i + 1)
-    plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
-    plt.axis("off")
-
-# %%
-# Cleanup the video and dataset:
-import os
-import shutil
-os.remove("./WUzgd7C1pWA.mp4")
-shutil.rmtree("./dataset")
diff --git a/torchvision/io/_video_deprecation_warning.py b/torchvision/io/_video_deprecation_warning.py
new file mode 100644
index 00000000000..ea01d976110
--- /dev/null
+++ b/torchvision/io/_video_deprecation_warning.py
@@ -0,0 +1,12 @@
+import warnings
+
+
+def _raise_video_deprecation_warning():
+    warnings.warn(
+        "The video decoding and encoding capabilities of torchvision "
+        "are deprecated from version 0.22 and will be removed in version 0.24. "
" + "We recommend that you migrate to TorchCodec, where we'll consolidate " + "the future decoding/encoding capabilities of PyTorch: " + "https://github.com/pytorch/torchcodec", + UserWarning, + ) diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py index 69af045e773..7957ce3899a 100644 --- a/torchvision/io/_video_opt.py +++ b/torchvision/io/_video_opt.py @@ -6,6 +6,7 @@ import torch from ..extension import _load_library +from ._video_deprecation_warning import _raise_video_deprecation_warning try: @@ -185,6 +186,7 @@ def _read_video_from_file( info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int) """ + _raise_video_deprecation_warning() _validate_pts(video_pts_range) _validate_pts(audio_pts_range) @@ -256,6 +258,7 @@ def _probe_video_from_file(filename: str) -> VideoMetaData: """ Probe a video file and return VideoMetaData with info about the video """ + _raise_video_deprecation_warning() result = torch.ops.video_reader.probe_video_from_file(filename) vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration) @@ -331,6 +334,7 @@ def _read_video_from_memory( `K` is the number of channels """ + _raise_video_deprecation_warning() _validate_pts(video_pts_range) _validate_pts(audio_pts_range) @@ -405,6 +409,7 @@ def _read_video_timestamps_from_memory( 0, # audio_timebase_num 1, # audio_timebase_den ) + _raise_video_deprecation_warning() _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration) @@ -420,6 +425,7 @@ def _probe_video_from_memory( Probe a video in memory and return VideoMetaData with info about the video This function is torchscriptable """ + _raise_video_deprecation_warning() if not isinstance(video_data, torch.Tensor): with warnings.catch_warnings(): # Ignore the warning because we actually don't modify the buffer in this function @@ -437,6 +443,7 @@ def _read_video( end_pts: Optional[Union[float, Fraction]] = None, pts_unit: str = "pts", ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, float]]: + _raise_video_deprecation_warning() if end_pts is None: end_pts = float("inf") @@ -495,6 +502,7 @@ def get_pts(time_base): def _read_video_timestamps( filename: str, pts_unit: str = "pts" ) -> Tuple[Union[List[int], List[Fraction]], Optional[float]]: + _raise_video_deprecation_warning() if pts_unit == "pts": warnings.warn( "The pts_unit 'pts' gives wrong results and will be removed in a " diff --git a/torchvision/io/video.py b/torchvision/io/video.py index c2a8c889162..5e56848d817 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -11,6 +11,7 @@ from ..utils import _log_api_usage_once from . import _video_opt +from ._video_deprecation_warning import _raise_video_deprecation_warning try: import av @@ -66,21 +67,21 @@ def write_video( audio_options: Optional[Dict[str, Any]] = None, ) -> None: """ - Writes a 4d tensor in [T, H, W, C] format in a video file. + [DEPRECATED] Writes a 4d tensor in [T, H, W, C] format in a video file. + + .. warning:: + + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. 
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch.
 
     This function relies on PyAV (therefore, ultimately FFmpeg) to encode
     videos, you can get more fine-grained control by referring to the other
     options at your disposal within `the FFMpeg wiki `_.
 
-    .. warning::
-
-        In the near future, we intend to centralize PyTorch's video decoding
-        capabilities within the `torchcodec
-        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
-        try it out and share your feedback, as the torchvision video decoders
-        will eventually be deprecated.
-
     Args:
         filename (str): path where the video will be saved
         video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
@@ -107,6 +108,7 @@
         >>> write_video("video.mp4", options = {"crf": "17"})
 
     """
+    _raise_video_deprecation_warning()
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(write_video)
     _check_av_available()
@@ -276,16 +278,15 @@ def read_video(
     pts_unit: str = "pts",
     output_format: str = "THWC",
 ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
-    """
-    Reads a video from a file, returning both the video frames and the audio frames
+    """[DEPRECATED] Reads a video from a file, returning both the video frames and the audio frames
 
     .. warning::
 
-        In the near future, we intend to centralize PyTorch's video decoding
-        capabilities within the `torchcodec
-        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
-        try it out and share your feedback, as the torchvision video decoders
-        will eventually be deprecated.
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24. We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch.
 
     Args:
         filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts.
@@ -302,6 +303,7 @@
         aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points
         info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int)
     """
+    _raise_video_deprecation_warning()
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(read_video)
 
@@ -408,16 +410,15 @@ def _decode_video_timestamps(container: "av.container.Container") -> List[int]:
 
 
 def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[int], Optional[float]]:
-    """
-    List the video frames timestamps.
+    """[DEPRECATED] List the video frames timestamps.
 
     .. warning::
 
-        In the near future, we intend to centralize PyTorch's video decoding
-        capabilities within the `torchcodec
-        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
-        try it out and share your feedback, as the torchvision video decoders
-        will eventually be deprecated.
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24. We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch.
 
     Note that the function decodes the whole video frame-by-frame.
@@ -432,6 +433,7 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[in
         video_fps (float, optional): the frame rate for the video
 
     """
+    _raise_video_deprecation_warning()
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(read_video_timestamps)
     from torchvision import get_video_backend
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py
index cf319fe288e..5096b6ba324 100644
--- a/torchvision/io/video_reader.py
+++ b/torchvision/io/video_reader.py
@@ -6,6 +6,7 @@
 import torch
 
 from ..utils import _log_api_usage_once
+from ._video_deprecation_warning import _raise_video_deprecation_warning
 from ._video_opt import _HAS_CPU_VIDEO_DECODER
 
 
@@ -45,8 +46,7 @@ def _has_video_opt() -> bool:
 
 
 class VideoReader:
-    """
-    Fine-grained video-reading API.
+    """[DEPRECATED] Fine-grained video-reading API.
 
     Supports frame-by-frame reading of various streams from a single video container. Much like
     previous video_reader API it supports the following backends: video_reader, pyav, and cuda.
@@ -54,11 +54,11 @@
 
     .. warning::
 
-        In the near future, we intend to centralize PyTorch's video decoding
-        capabilities within the `torchcodec
-        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
-        try it out and share your feedback, as the torchvision video decoders
-        will eventually be deprecated.
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24. We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch.
 
     .. betastatus:: VideoReader class
 
@@ -125,6 +125,7 @@ def __init__(
         stream: str = "video",
         num_threads: int = 0,
     ) -> None:
+        _raise_video_deprecation_warning()
         _log_api_usage_once(self)
         from .. import get_video_backend
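Migration note (illustrative sketch, not part of the patch above): the new warnings point users at
TorchCodec as the replacement decoder. The snippet below shows roughly what the move away from
``torchvision.io.read_video`` / ``VideoReader`` looks like. It assumes TorchCodec's ``VideoDecoder``
API (path constructor, ``.metadata``, integer frame indexing), which may differ in the release you
install, so treat it as a sketch rather than a drop-in recipe::

    # Hypothetical TorchCodec equivalent of torchvision.io.read_video / VideoReader.
    # Assumes the VideoDecoder API documented by the torchcodec project.
    import torch
    from torchcodec.decoders import VideoDecoder

    decoder = VideoDecoder("video.mp4")   # one decoder per container
    print(decoder.metadata)               # stream metadata (fps, duration, number of frames, ...)

    # Decode the first 16 frames; each frame comes back as a uint8 tensor in CHW layout.
    clip = torch.stack([decoder[i] for i in range(16)])
    print(clip.shape)

Until the removal in version 0.24, the existing torchvision calls keep working but emit the
``UserWarning`` raised by ``_raise_video_deprecation_warning``; callers that cannot migrate yet can
silence it with ``warnings.filterwarnings("ignore", message="The video decoding and encoding")``.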