diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst
deleted file mode 100644
index 04ef28ab9..000000000
--- a/docs/source/api_ref_transforms.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _transforms:
-
-=====================
-torchcodec.transforms
-=====================
-
-.. currentmodule:: torchcodec.transforms
-
-For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
-
-.. autosummary::
-    :toctree: generated/
-    :nosignatures:
-    :template: dataclass.rst
-
-    DecoderTransform
-    Resize
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 74e8d1298..2318adfd9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -133,4 +133,3 @@ Encoding
   api_ref_decoders
   api_ref_encoders
   api_ref_samplers
-  api_ref_transforms
diff --git a/src/torchcodec/__init__.py b/src/torchcodec/__init__.py
index 144d3a67f..079db9824 100644
--- a/src/torchcodec/__init__.py
+++ b/src/torchcodec/__init__.py
@@ -9,7 +9,7 @@
 # Note: usort wants to put Frame and FrameBatch after decoders and samplers,
 # but that results in circular import.
 from ._frame import AudioSamples, Frame, FrameBatch  # usort:skip  # noqa
-from . import decoders, encoders, samplers, transforms  # noqa
+from . import decoders, encoders, samplers  # noqa
 
 try:
     # Note that version.py is generated during install.
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 1b4d4706d..87d79b217 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -8,10 +8,10 @@
 import json
 import numbers
 from pathlib import Path
-from typing import List, Literal, Optional, Sequence, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import torch
-from torch import device as torch_device, nn, Tensor
+from torch import device as torch_device, Tensor
 
 from torchcodec import _core as core, Frame, FrameBatch
 from torchcodec.decoders._decoder_utils import (
@@ -19,7 +19,6 @@
     create_decoder,
     ERROR_REPORTING_INSTRUCTIONS,
 )
-from torchcodec.transforms import DecoderTransform, Resize
 
 
 class VideoDecoder:
@@ -68,11 +67,6 @@ class VideoDecoder:
             probably is. Default: "exact". Read more about this parameter in:
             :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
 
-        transforms (sequence of transform objects, optional): Sequence of transforms to be
-            applied to the decoded frames by the decoder itself, in order. Accepts both
-            :class:`~torchcodec.transforms.DecoderTransform` and
-            :class:`~torchvision.transforms.v2.Transform`
-            objects. Read more about this parameter in: TODO_DECODER_TRANSFORMS_TUTORIAL.
         custom_frame_mappings (str, bytes, or file-like object, optional): Mapping of
             frames to their metadata, typically generated via ffprobe. This enables
             accurate frame seeking without requiring a full video scan.
@@ -111,7 +105,6 @@ def __init__(
         num_ffmpeg_threads: int = 1,
         device: Optional[Union[str, torch_device]] = None,
         seek_mode: Literal["exact", "approximate"] = "exact",
-        transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]] = None,
         custom_frame_mappings: Optional[
             Union[str, bytes, io.RawIOBase, io.BufferedReader]
         ] = None,
@@ -167,7 +160,6 @@ def __init__(
         device = str(device)
 
         device_variant = _get_cuda_backend()
-        transform_specs = _make_transform_specs(transforms)
 
         core.add_video_stream(
             self._decoder,
@@ -176,7 +168,6 @@ def __init__(
             num_threads=num_ffmpeg_threads,
             device=device,
             device_variant=device_variant,
-            transform_specs=transform_specs,
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
@@ -448,78 +439,6 @@ def _get_and_validate_stream_metadata(
     )
 
 
-def _convert_to_decoder_transforms(
-    transforms: Sequence[Union[DecoderTransform, nn.Module]],
-) -> List[DecoderTransform]:
-    """Convert a sequence of transforms that may contain TorchVision transform
-    objects into a list of only TorchCodec transform objects.
-
-    Args:
-        transforms: Squence of transform objects. The objects can be one of two
-            types:
-            1. torchcodec.transforms.DecoderTransform
-            2. torchvision.transforms.v2.Transform, but our type annotation
-               only mentions its base, nn.Module. We don't want to take a
-               hard dependency on TorchVision.
-
-    Returns:
-        List of DecoderTransform objects.
-    """
-    try:
-        from torchvision.transforms import v2
-
-        tv_available = True
-    except ImportError:
-        tv_available = False
-
-    converted_transforms: list[DecoderTransform] = []
-    for transform in transforms:
-        if not isinstance(transform, DecoderTransform):
-            if not tv_available:
-                raise ValueError(
-                    f"The supplied transform, {transform}, is not a TorchCodec "
-                    " DecoderTransform. TorchCodec also accept TorchVision "
-                    "v2 transforms, but TorchVision is not installed."
-                )
-            elif isinstance(transform, v2.Resize):
-                converted_transforms.append(Resize._from_torchvision(transform))
-            else:
-                raise ValueError(
-                    f"Unsupported transform: {transform}. Transforms must be "
-                    "either a TorchCodec DecoderTransform or a TorchVision "
-                    "v2 transform."
-                )
-        else:
-            converted_transforms.append(transform)
-
-    return converted_transforms
-
-
-def _make_transform_specs(
-    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
-) -> str:
-    """Given a sequence of transforms, turn those into the specification string
-    the core API expects.
-
-    Args:
-        transforms: Optional sequence of transform objects. The objects can be
-            one of two types:
-            1. torchcodec.transforms.DecoderTransform
-            2. torchvision.transforms.v2.Transform, but our type annotation
-               only mentions its base, nn.Module. We don't want to take a
-               hard dependency on TorchVision.
-
-    Returns:
-        String of transforms in the format the core API expects: transform
-        specifications separate by semicolons.
-    """
-    if transforms is None:
-        return ""
-
-    transforms = _convert_to_decoder_transforms(transforms)
-    return ";".join([t._make_transform_spec() for t in transforms])
-
-
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
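Note for reviewers: the `transforms` parameter and spec-string plumbing removed above are exercised end to end by the deleted tests further down. A minimal sketch of the removed usage, reconstructed from that test code ("video.mp4" is a placeholder path; this no longer runs once this diff lands):

    # Removed public API: decoder-side transforms (sketch, not runnable post-diff;
    # "video.mp4" is a placeholder).
    from torchcodec.decoders import VideoDecoder
    from torchcodec.transforms import Resize
    from torchvision.transforms import v2  # TorchVision v2 transforms were also accepted

    # Both spellings specified the same decode-time resize and produced
    # identical (135, 240) frames:
    decoder = VideoDecoder("video.mp4", transforms=[Resize(size=(135, 240))])
    decoder_tv = VideoDecoder("video.mp4", transforms=[v2.Resize(size=(135, 240))])

    # Internally, _make_transform_specs() serialized the sequence into the
    # core API's spec string, e.g. "resize, 135, 240", with multiple
    # transforms joined by semicolons.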
diff --git a/src/torchcodec/transforms/__init__.py b/src/torchcodec/transforms/__init__.py
deleted file mode 100644
index 9f4a92f81..000000000
--- a/src/torchcodec/transforms/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from ._decoder_transforms import DecoderTransform, Resize  # noqa
diff --git a/test/test_transform_ops.py b/test/test_transform_ops.py
deleted file mode 100644
index bc42732ef..000000000
--- a/test/test_transform_ops.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import contextlib
-
-import json
-import os
-import subprocess
-
-import pytest
-
-import torch
-import torchcodec
-
-from torchcodec._core import (
-    _add_video_stream,
-    add_video_stream,
-    create_from_file,
-    get_frame_at_index,
-    get_json_metadata,
-)
-from torchcodec.decoders import VideoDecoder
-
-from torchvision.transforms import v2
-
-from .utils import (
-    assert_frames_equal,
-    assert_tensor_close_on_at_least,
-    AV1_VIDEO,
-    get_ffmpeg_major_version,
-    get_ffmpeg_minor_version,
-    H265_VIDEO,
-    NASA_VIDEO,
-    needs_cuda,
-    TEST_SRC_2_720P,
-)
-
-
-class TestPublicVideoDecoderTransformOps:
-    @pytest.mark.parametrize(
-        "height_scaling_factor, width_scaling_factor",
-        ((1.5, 1.31), (0.5, 0.71), (0.7, 1.31), (1.5, 0.71), (1.0, 1.0), (2.0, 2.0)),
-    )
-    @pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P])
-    def test_resize_torchvision(
-        self, video, height_scaling_factor, width_scaling_factor
-    ):
-        height = int(video.get_height() * height_scaling_factor)
-        width = int(video.get_width() * width_scaling_factor)
-
-        # We're using both the TorchCodec object and the TorchVision object to
-        # ensure that they specify exactly the same thing.
-        decoder_resize = VideoDecoder(
-            video.path, transforms=[torchcodec.transforms.Resize(size=(height, width))]
-        )
-        decoder_resize_tv = VideoDecoder(
-            video.path, transforms=[v2.Resize(size=(height, width))]
-        )
-
-        decoder_full = VideoDecoder(video.path)
-
-        num_frames = len(decoder_resize)
-        assert num_frames == len(decoder_full)
-
-        for frame_index in [
-            0,
-            int(num_frames * 0.1),
-            int(num_frames * 0.2),
-            int(num_frames * 0.3),
-            int(num_frames * 0.4),
-            int(num_frames * 0.5),
-            int(num_frames * 0.75),
-            int(num_frames * 0.90),
-            num_frames - 1,
-        ]:
-            frame_resize_tv = decoder_resize_tv[frame_index]
-            frame_resize = decoder_resize[frame_index]
-            assert_frames_equal(frame_resize_tv, frame_resize)
-
-            frame_full = decoder_full[frame_index]
-
-            frame_tv = v2.functional.resize(frame_full, size=(height, width))
-            frame_tv_no_antialias = v2.functional.resize(
-                frame_full, size=(height, width), antialias=False
-            )
-
-            expected_shape = (video.get_num_color_channels(), height, width)
-            assert frame_resize.shape == expected_shape
-            assert frame_tv.shape == expected_shape
-            assert frame_tv_no_antialias.shape == expected_shape
-
-            assert_tensor_close_on_at_least(
-                frame_resize, frame_tv, percentage=99.8, atol=1
-            )
-            torch.testing.assert_close(frame_resize, frame_tv, rtol=0, atol=6)
-
-            if height_scaling_factor < 1 or width_scaling_factor < 1:
-                # Antialias only relevant when down-scaling!
-                with pytest.raises(AssertionError, match="Expected at least"):
-                    assert_tensor_close_on_at_least(
-                        frame_resize, frame_tv_no_antialias, percentage=99, atol=1
-                    )
-                with pytest.raises(AssertionError, match="Tensor-likes are not close"):
-                    torch.testing.assert_close(
-                        frame_resize, frame_tv_no_antialias, rtol=0, atol=6
-                    )
-
-    def test_resize_fails(self):
-        with pytest.raises(
-            ValueError,
-            match=r"must use bilinear interpolation",
-        ):
-            VideoDecoder(
-                NASA_VIDEO.path,
-                transforms=[
-                    v2.Resize(
-                        size=(100, 100), interpolation=v2.InterpolationMode.BICUBIC
-                    )
-                ],
-            )
-
-        with pytest.raises(
-            ValueError,
-            match=r"must have antialias enabled",
-        ):
-            VideoDecoder(
-                NASA_VIDEO.path,
-                transforms=[v2.Resize(size=(100, 100), antialias=False)],
-            )
-
-        with pytest.raises(
-            ValueError,
-            match=r"must have a size specified",
-        ):
-            VideoDecoder(
-                NASA_VIDEO.path, transforms=[v2.Resize(size=None, max_size=100)]
-            )
-
-        with pytest.raises(
-            ValueError,
-            match=r"must have a \(height, width\) pair for the size",
-        ):
-            VideoDecoder(NASA_VIDEO.path, transforms=[v2.Resize(size=(100))])
-
-    def test_transform_fails(self):
-        with pytest.raises(
-            ValueError,
-            match="Unsupported transform",
-        ):
-            VideoDecoder(NASA_VIDEO.path, transforms=[v2.RandomHorizontalFlip(p=1.0)])
-
-
-class TestCoreVideoDecoderTransformOps:
-    def get_num_frames_core_ops(self, video):
-        decoder = create_from_file(str(video.path))
-        add_video_stream(decoder)
-        metadata = get_json_metadata(decoder)
-        metadata_dict = json.loads(metadata)
-        num_frames = metadata_dict["numFramesFromHeader"]
-        assert num_frames is not None
-        return num_frames
-
-    @pytest.mark.parametrize("video", [NASA_VIDEO, H265_VIDEO, AV1_VIDEO])
-    def test_color_conversion_library(self, video):
-        num_frames = self.get_num_frames_core_ops(video)
-
-        filtergraph_decoder = create_from_file(str(video.path))
-        _add_video_stream(
-            filtergraph_decoder,
-            color_conversion_library="filtergraph",
-        )
-
-        swscale_decoder = create_from_file(str(video.path))
-        _add_video_stream(
-            swscale_decoder,
-            color_conversion_library="swscale",
-        )
-
-        for frame_index in [
-            0,
-            int(num_frames * 0.25),
-            int(num_frames * 0.5),
-            int(num_frames * 0.75),
-            num_frames - 1,
-        ]:
-            filtergraph_frame, *_ = get_frame_at_index(
-                filtergraph_decoder, frame_index=frame_index
-            )
-            swscale_frame, *_ = get_frame_at_index(
-                swscale_decoder, frame_index=frame_index
-            )
-
-            assert_frames_equal(filtergraph_frame, swscale_frame)
-
-    @pytest.mark.parametrize("width", [30, 32, 300])
-    @pytest.mark.parametrize("height", [128])
-    def test_color_conversion_library_with_generated_videos(
-        self, tmp_path, width, height
-    ):
-        # We consider filtergraph to be the reference color conversion library.
-        # However the video decoder sometimes uses swscale as that is faster.
-        # The exact color conversion library used is an implementation detail
-        # of the video decoder and depends on the video's width.
-        #
-        # In this test we compare the output of filtergraph (which is the
-        # reference) with the output of the video decoder (which may use
-        # swscale if it chooses for certain video widths) to make sure they are
-        # always the same.
-        video_path = f"{tmp_path}/frame_numbers_{width}x{height}.mp4"
-        # We don't specify a particular encoder because the ffmpeg binary could
-        # be configured with different encoders. For the purposes of this test,
-        # the actual encoder is irrelevant.
-        with contextlib.ExitStack() as stack:
-            ffmpeg_cli = "ffmpeg"
-
-            if os.environ.get("IN_FBCODE_TORCHCODEC") == "1":
-                import importlib.resources
-
-                ffmpeg_cli = stack.enter_context(
-                    importlib.resources.path(__package__, "ffmpeg")
-                )
-
-            command = [
-                ffmpeg_cli,
-                "-y",
-                "-f",
-                "lavfi",
-                "-i",
-                "color=blue",
-                "-pix_fmt",
-                "yuv420p",
-                "-s",
-                f"{width}x{height}",
-                "-frames:v",
-                "1",
-                video_path,
-            ]
-            subprocess.check_call(command)
-
-        decoder = create_from_file(str(video_path))
-        add_video_stream(decoder)
-        metadata = get_json_metadata(decoder)
-        metadata_dict = json.loads(metadata)
-        assert metadata_dict["width"] == width
-        assert metadata_dict["height"] == height
-
-        num_frames = metadata_dict["numFramesFromHeader"]
-        assert num_frames is not None and num_frames == 1
-
-        filtergraph_decoder = create_from_file(str(video_path))
-        _add_video_stream(
-            filtergraph_decoder,
-            color_conversion_library="filtergraph",
-        )
-
-        auto_decoder = create_from_file(str(video_path))
-        add_video_stream(
-            auto_decoder,
-        )
-
-        filtergraph_frame0, *_ = get_frame_at_index(filtergraph_decoder, frame_index=0)
-        auto_frame0, *_ = get_frame_at_index(auto_decoder, frame_index=0)
-        assert_frames_equal(filtergraph_frame0, auto_frame0)
-
-    @needs_cuda
-    def test_scaling_on_cuda_fails(self):
-        decoder = create_from_file(str(NASA_VIDEO.path))
-        with pytest.raises(
-            RuntimeError,
-            match="Transforms are only supported for CPU devices.",
-        ):
-            add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
-
-    def test_transform_fails(self):
-        decoder = create_from_file(str(NASA_VIDEO.path))
-        with pytest.raises(
-            RuntimeError,
-            match="Invalid transform spec",
-        ):
-            add_video_stream(decoder, transform_specs=";")
-
-        with pytest.raises(
-            RuntimeError,
-            match="Invalid transform name",
-        ):
-            add_video_stream(decoder, transform_specs="invalid, 1, 2")
-
-    def test_resize_ffmpeg(self):
-        height = 135
-        width = 240
-        expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
-        resize_spec = f"resize, {height}, {width}"
-        resize_filtergraph = f"scale={width}:{height}:flags=bilinear"
-
-        decoder_resize = create_from_file(str(NASA_VIDEO.path))
-        add_video_stream(decoder_resize, transform_specs=resize_spec)
-
-        for frame_index in [17, 230, 389]:
-            frame_resize, *_ = get_frame_at_index(
-                decoder_resize, frame_index=frame_index
-            )
-            frame_ref = NASA_VIDEO.get_frame_data_by_index(
-                frame_index, filters=resize_filtergraph
-            )
-
-            assert frame_resize.shape == expected_shape
-            assert frame_ref.shape == expected_shape
-
-            if get_ffmpeg_major_version() <= 4 and get_ffmpeg_minor_version() <= 1:
-                # FFmpeg version 4.1 and before appear to have a different
-                # resize implementation.
-                torch.testing.assert_close(frame_resize, frame_ref, rtol=0, atol=2)
-            else:
-                assert_frames_equal(frame_resize, frame_ref)
-
-    def test_resize_transform_fails(self):
-        decoder = create_from_file(str(NASA_VIDEO.path))
-        with pytest.raises(
-            RuntimeError,
-            match="must have 3 elements",
-        ):
-            add_video_stream(decoder, transform_specs="resize, 100, 100, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="must be a positive integer",
-        ):
-            add_video_stream(decoder, transform_specs="resize, -10, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="must be a positive integer",
-        ):
-            add_video_stream(decoder, transform_specs="resize, 100, 0")
-
-        with pytest.raises(
-            RuntimeError,
-            match="cannot be converted to an int",
-        ):
-            add_video_stream(decoder, transform_specs="resize, blah, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="out of range",
-        ):
-            add_video_stream(decoder, transform_specs="resize, 100, 1000000000000")
-
-    def test_crop_transform(self):
-        # Note that filtergraph accepts dimensions as (w, h) and we accept them as (h, w).
-        width = 300
-        height = 200
-        x = 50
-        y = 35
-        crop_spec = f"crop, {height}, {width}, {x}, {y}"
-        crop_filtergraph = f"crop={width}:{height}:{x}:{y}:exact=1"
-        expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
-
-        decoder_crop = create_from_file(str(NASA_VIDEO.path))
-        add_video_stream(decoder_crop, transform_specs=crop_spec)
-
-        decoder_full = create_from_file(str(NASA_VIDEO.path))
-        add_video_stream(decoder_full)
-
-        for frame_index in [0, 15, 200, 389]:
-            frame_crop, *_ = get_frame_at_index(decoder_crop, frame_index=frame_index)
-            frame_ref = NASA_VIDEO.get_frame_data_by_index(
-                frame_index, filters=crop_filtergraph
-            )
-
-            frame_full, *_ = get_frame_at_index(decoder_full, frame_index=frame_index)
-            frame_tv = v2.functional.crop(
-                frame_full, top=y, left=x, height=height, width=width
-            )
-
-            assert frame_crop.shape == expected_shape
-            assert frame_ref.shape == expected_shape
-            assert frame_tv.shape == expected_shape
-
-            assert_frames_equal(frame_crop, frame_ref)
-            assert_frames_equal(frame_crop, frame_tv)
-
-    def test_crop_transform_fails(self):
-
-        with pytest.raises(
-            RuntimeError,
-            match="must have 5 elements",
-        ):
-            decoder = create_from_file(str(NASA_VIDEO.path))
-            add_video_stream(decoder, transform_specs="crop, 100, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="must be a positive integer",
-        ):
-            decoder = create_from_file(str(NASA_VIDEO.path))
-            add_video_stream(decoder, transform_specs="crop, -10, 100, 100, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="cannot be converted to an int",
-        ):
-            decoder = create_from_file(str(NASA_VIDEO.path))
-            add_video_stream(decoder, transform_specs="crop, 100, 100, blah, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="x position out of bounds",
-        ):
-            decoder = create_from_file(str(NASA_VIDEO.path))
-            add_video_stream(decoder, transform_specs="crop, 100, 100, 9999, 100")
-
-        with pytest.raises(
-            RuntimeError,
-            match="y position out of bounds",
-        ):
-            decoder = create_from_file(str(NASA_VIDEO.path))
-            add_video_stream(decoder, transform_specs="crop, 999, 100, 100, 100")
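Note for reviewers: the deleted core-op tests above also pin down the `transform_specs` grammar and its FFmpeg filtergraph equivalents. A minimal sketch reconstructed from those tests ("video.mp4" is a placeholder path; this no longer runs once this diff lands):

    # Removed core API: transform_specs (sketch, not runnable post-diff;
    # "video.mp4" is a placeholder).
    from torchcodec._core import add_video_stream, create_from_file

    decoder = create_from_file("video.mp4")

    # Specs take dimensions as (height, width); the equivalent filtergraph
    # strings take (width, height):
    #   "resize, 135, 240"        ->  scale=240:135:flags=bilinear
    #   "crop, 200, 300, 50, 35"  ->  crop=300:200:50:35:exact=1
    add_video_stream(decoder, transform_specs="resize, 135, 240")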