diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 3c6048187..9c7b8ac7f 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -239,6 +239,19 @@ int checkedToPositiveInt(const std::string& str) { return ret; } +int checkedToNonNegativeInt(const std::string& str) { + int ret = 0; + try { + ret = std::stoi(str); + } catch (const std::invalid_argument&) { + TORCH_CHECK(false, "String cannot be converted to an int:" + str); + } catch (const std::out_of_range&) { + TORCH_CHECK(false, "String would become integer out of range:" + str); + } + TORCH_CHECK(ret >= 0, "String must be a non-negative integer:" + str); + return ret; +} + // Resize transform specs take the form: // // "resize, , " @@ -270,8 +283,8 @@ Transform* makeCropTransform( "cropTransformSpec must have 5 elements including its name"); int height = checkedToPositiveInt(cropTransformSpec[1]); int width = checkedToPositiveInt(cropTransformSpec[2]); - int x = checkedToPositiveInt(cropTransformSpec[3]); - int y = checkedToPositiveInt(cropTransformSpec[4]); + int x = checkedToNonNegativeInt(cropTransformSpec[3]); + int y = checkedToNonNegativeInt(cropTransformSpec[4]); return new CropTransform(FrameDims(height, width), x, y); } diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 1b4d4706d..4a249d1a7 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -19,7 +19,7 @@ create_decoder, ERROR_REPORTING_INSTRUCTIONS, ) -from torchcodec.transforms import DecoderTransform, Resize +from torchcodec.transforms import DecoderTransform, RandomCrop, Resize class VideoDecoder: @@ -167,7 +167,11 @@ def __init__( device = str(device) device_variant = _get_cuda_backend() - transform_specs = _make_transform_specs(transforms) + transform_specs = _make_transform_specs( + transforms, + input_dims=(self.metadata.height, self.metadata.width), + 
dimension_order=dimension_order, + ) core.add_video_stream( self._decoder, @@ -450,6 +454,8 @@ def _get_and_validate_stream_metadata( def _convert_to_decoder_transforms( transforms: Sequence[Union[DecoderTransform, nn.Module]], + input_dims: Tuple[Optional[int], Optional[int]], + dimension_order: Literal["NCHW", "NHWC"], ) -> List[DecoderTransform]: """Convert a sequence of transforms that may contain TorchVision transform objects into a list of only TorchCodec transform objects. @@ -482,7 +488,22 @@ def _convert_to_decoder_transforms( "v2 transforms, but TorchVision is not installed." ) elif isinstance(transform, v2.Resize): - converted_transforms.append(Resize._from_torchvision(transform)) + transform_tc = Resize._from_torchvision(transform) + input_dims = transform_tc._get_output_dims(input_dims) + converted_transforms.append(transform_tc) + elif isinstance(transform, v2.RandomCrop): + if dimension_order != "NCHW": + raise ValueError( + "TorchVision v2 RandomCrop is only supported for NCHW " + "dimension order. Please use the TorchCodec RandomCrop " + "transform instead." + ) + transform_tc = RandomCrop._from_torchvision( + transform, + input_dims, + ) + input_dims = transform_tc._get_output_dims(input_dims) + converted_transforms.append(transform_tc) else: raise ValueError( f"Unsupported transform: {transform}. Transforms must be " @@ -490,6 +511,7 @@ def _convert_to_decoder_transforms( "v2 transform." ) else: + input_dims = transform._get_output_dims(input_dims) converted_transforms.append(transform) return converted_transforms @@ -497,6 +519,8 @@ def _convert_to_decoder_transforms( def _make_transform_specs( transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]], + input_dims: Tuple[Optional[int], Optional[int]], + dimension_order: Literal["NCHW", "NHWC"], ) -> str: """Given a sequence of transforms, turn those into the specification string the core API expects. 
@@ -516,7 +540,7 @@ def _make_transform_specs( if transforms is None: return "" - transforms = _convert_to_decoder_transforms(transforms) + transforms = _convert_to_decoder_transforms(transforms, input_dims, dimension_order) return ";".join([t._make_transform_spec() for t in transforms]) diff --git a/src/torchcodec/transforms/__init__.py b/src/torchcodec/transforms/__init__.py index 9f4a92f81..c93bad39e 100644 --- a/src/torchcodec/transforms/__init__.py +++ b/src/torchcodec/transforms/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from ._decoder_transforms import DecoderTransform, Resize # noqa +from ._decoder_transforms import DecoderTransform, RandomCrop, Resize # noqa diff --git a/src/torchcodec/transforms/_decoder_transforms.py b/src/torchcodec/transforms/_decoder_transforms.py index dec4704b0..7ac45c1cd 100644 --- a/src/torchcodec/transforms/_decoder_transforms.py +++ b/src/torchcodec/transforms/_decoder_transforms.py @@ -7,8 +7,9 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from types import ModuleType -from typing import Sequence +from typing import Optional, Sequence, Tuple +import torch from torch import nn @@ -22,8 +23,8 @@ class DecoderTransform(ABC): decoded frames and applying the same kind of transform. Most ``DecoderTransform`` objects have a complementary transform in TorchVision, - specificially in `torchvision.transforms.v2 `_. For such transforms, we - ensure that: + specificially in `torchvision.transforms.v2 `_. + For such transforms, we ensure that: 1. The names are the same. 2. Default behaviors are the same. 
@@ -40,6 +41,11 @@ class DecoderTransform(ABC): def _make_transform_spec(self) -> str: pass + def _get_output_dims( + self, input_dims: Tuple[Optional[int], Optional[int]] + ) -> Tuple[Optional[int], Optional[int]]: + return input_dims + def import_torchvision_transforms_v2() -> ModuleType: try: @@ -66,28 +72,167 @@ class Resize(DecoderTransform): size: Sequence[int] def _make_transform_spec(self) -> str: + # TODO: establish this invariant in the constructor during refactor assert len(self.size) == 2 return f"resize, {self.size[0]}, {self.size[1]}" + def _get_output_dims( + self, input_dims: Tuple[Optional[int], Optional[int]] + ) -> Tuple[Optional[int], Optional[int]]: + # TODO: establish this invariant in the constructor during refactor + assert len(self.size) == 2 + return (self.size[0], self.size[1]) + @classmethod - def _from_torchvision(cls, resize_tv: nn.Module): + def _from_torchvision(cls, tv_resize: nn.Module): v2 = import_torchvision_transforms_v2() - assert isinstance(resize_tv, v2.Resize) + assert isinstance(tv_resize, v2.Resize) - if resize_tv.interpolation is not v2.InterpolationMode.BILINEAR: + if tv_resize.interpolation is not v2.InterpolationMode.BILINEAR: raise ValueError( "TorchVision Resize transform must use bilinear interpolation." ) - if resize_tv.antialias is False: + if tv_resize.antialias is False: raise ValueError( "TorchVision Resize transform must have antialias enabled." ) - if resize_tv.size is None: + if tv_resize.size is None: raise ValueError("TorchVision Resize transform must have a size specified.") - if len(resize_tv.size) != 2: + if len(tv_resize.size) != 2: raise ValueError( "TorchVision Resize transform must have a (height, width) " - f"pair for the size, got {resize_tv.size}." + f"pair for the size, got {tv_resize.size}." + ) + return cls(size=tv_resize.size) + + +@dataclass +class RandomCrop(DecoderTransform): + """Crop the decoded frame to a given size at a random location in the frame. 
+
+    Complementary TorchVision transform: :class:`~torchvision.transforms.v2.RandomCrop`.
+    Padding of all kinds is disabled. The random location within the frame is
+    determined during the initialization of the
+    :class:`~torchcodec.decoders.VideoDecoder` object that owns this transform.
+    As a consequence, each decoded frame in the video will be cropped at the
+    same location. Videos with variable resolution may result in undefined
+    behavior.
+
+    Args:
+        size: (sequence of int): Desired output size. Must be a sequence of
+            the form (height, width).
+    """
+
+    size: Sequence[int]
+    _top: Optional[int] = None
+    _left: Optional[int] = None
+    _input_dims: Optional[Tuple[int, int]] = None
+
+    def _make_transform_spec(self) -> str:
+        if len(self.size) != 2:
+            raise ValueError(
+                f"RandomCrop's size must be a sequence of length 2, got {self.size}. "
+                "This should never happen, please report a bug."
+            )
+
+        if self._top is None or self._left is None:
+            # TODO: If only ONE of _top/_left is None, something likely went
+            # wrong upstream; consider raising an error for that case instead
+            # of silently recomputing both below.
+            if self._input_dims is None:
+                raise ValueError(
+                    "RandomCrop's input_dims must be set before calling _make_transform_spec(). "
+                    "This should never happen, please report a bug."
+                )
+            if self._input_dims[0] < self.size[0] or self._input_dims[1] < self.size[1]:
+                raise ValueError(
+                    f"Input dimensions {self._input_dims} are smaller than the crop size {self.size}."
+                )
+
+            # Note: This logic must match the logic in
+            # torchvision.transforms.v2.RandomCrop.make_params(). Given
+            # the same seed, they should get the same result. This is an
+            # API guarantee with our users.
+ self._top = int( + torch.randint(0, self._input_dims[0] - self.size[0] + 1, size=()).item() + ) + self._left = int( + torch.randint(0, self._input_dims[1] - self.size[1] + 1, size=()).item() ) - return cls(size=resize_tv.size) + + return f"crop, {self.size[0]}, {self.size[1]}, {self._left}, {self._top}" + + def _get_output_dims( + self, input_dims: Tuple[Optional[int], Optional[int]] + ) -> Tuple[Optional[int], Optional[int]]: + # TODO: establish this invariant in the constructor during refactor + assert len(self.size) == 2 + + height, width = input_dims + if height is None: + raise ValueError( + "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known." + ) + if width is None: + raise ValueError( + "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known." + ) + + self._input_dims = (height, width) + return (self.size[0], self.size[1]) + + @classmethod + def _from_torchvision( + cls, + tv_random_crop: nn.Module, + input_dims: Tuple[Optional[int], Optional[int]], + ): + v2 = import_torchvision_transforms_v2() + + assert isinstance(tv_random_crop, v2.RandomCrop) + + if tv_random_crop.padding is not None: + raise ValueError( + "TorchVision RandomCrop transform must not specify padding." + ) + + if tv_random_crop.pad_if_needed is True: + raise ValueError( + "TorchVision RandomCrop transform must not specify pad_if_needed." + ) + + if tv_random_crop.fill != 0: + raise ValueError("TorchVision RandomCrop fill must be 0.") + + if tv_random_crop.padding_mode != "constant": + raise ValueError("TorchVision RandomCrop padding_mode must be constant.") + + if len(tv_random_crop.size) != 2: + raise ValueError( + "TorchVision RandcomCrop transform must have a (height, width) " + f"pair for the size, got {tv_random_crop.size}." + ) + + height, width = input_dims + if height is None: + raise ValueError( + "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known." 
+ ) + if width is None: + raise ValueError( + "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known." + ) + + # Note that TorchVision v2 transforms only accept NCHW tensors. + params = tv_random_crop.make_params( + torch.empty(size=(3, height, width), dtype=torch.uint8) + ) + + if tv_random_crop.size != (params["height"], params["width"]): + raise ValueError( + f"TorchVision RandomCrop's provided size, {tv_random_crop.size} " + f"must match the computed size, {params['height'], params['width']}." + ) + + return cls(size=tv_random_crop.size, _top=params["top"], _left=params["left"]) diff --git a/test/test_transform_ops.py b/test/test_transform_ops.py index bc42732ef..fd4a7de85 100644 --- a/test/test_transform_ops.py +++ b/test/test_transform_ops.py @@ -145,6 +145,131 @@ def test_resize_fails(self): ): VideoDecoder(NASA_VIDEO.path, transforms=[v2.Resize(size=(100))]) + @pytest.mark.parametrize( + "height_scaling_factor, width_scaling_factor", + ((0.5, 0.5), (0.25, 0.1), (1.0, 1.0), (0.25, 0.25)), + ) + @pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P]) + def test_random_crop_torchvision( + self, + height_scaling_factor, + width_scaling_factor, + video, + ): + height = int(video.get_height() * height_scaling_factor) + width = int(video.get_width() * width_scaling_factor) + + # We want both kinds of RandomCrop objects to get arrive at the same + # locations to crop, so we need to make sure they get the same random + # seed. 
+ torch.manual_seed(0) + tc_random_crop = torchcodec.transforms.RandomCrop(size=(height, width)) + decoder_random_crop = VideoDecoder(video.path, transforms=[tc_random_crop]) + + torch.manual_seed(0) + decoder_random_crop_tv = VideoDecoder( + video.path, + transforms=[v2.RandomCrop(size=(height, width))], + ) + + decoder_full = VideoDecoder(video.path) + + num_frames = len(decoder_random_crop_tv) + assert num_frames == len(decoder_full) + + for frame_index in [ + 0, + int(num_frames * 0.1), + int(num_frames * 0.2), + int(num_frames * 0.3), + int(num_frames * 0.4), + int(num_frames * 0.5), + int(num_frames * 0.75), + int(num_frames * 0.90), + num_frames - 1, + ]: + frame_random_crop = decoder_random_crop[frame_index] + frame_random_crop_tv = decoder_random_crop_tv[frame_index] + assert_frames_equal(frame_random_crop, frame_random_crop_tv) + + expected_shape = (video.get_num_color_channels(), height, width) + assert frame_random_crop_tv.shape == expected_shape + + frame_full = decoder_full[frame_index] + frame_tv = v2.functional.crop( + frame_full, + top=tc_random_crop._top, + left=tc_random_crop._left, + height=tc_random_crop.size[0], + width=tc_random_crop.size[1], + ) + assert_frames_equal(frame_random_crop, frame_tv) + + @pytest.mark.parametrize( + "height_scaling_factor, width_scaling_factor", + ((0.25, 0.1), (0.25, 0.25)), + ) + def test_random_crop_nhwc( + self, + height_scaling_factor, + width_scaling_factor, + ): + height = int(TEST_SRC_2_720P.get_height() * height_scaling_factor) + width = int(TEST_SRC_2_720P.get_width() * width_scaling_factor) + + decoder = VideoDecoder( + TEST_SRC_2_720P.path, + transforms=[torchcodec.transforms.RandomCrop(size=(height, width))], + dimension_order="NHWC", + ) + + num_frames = len(decoder) + for frame_index in [ + 0, + int(num_frames * 0.25), + int(num_frames * 0.5), + int(num_frames * 0.75), + num_frames - 1, + ]: + frame = decoder[frame_index] + assert frame.shape == (height, width, 3) + + @pytest.mark.parametrize( + 
"error_message, params", + ( + ("must not specify padding", dict(size=(100, 100), padding=255)), + ( + "must not specify pad_if_needed", + dict(size=(100, 100), pad_if_needed=True), + ), + ("fill must be 0", dict(size=(100, 100), fill=255)), + ( + "padding_mode must be constant", + dict(size=(100, 100), padding_mode="edge"), + ), + ), + ) + def test_crop_fails(self, error_message, params): + with pytest.raises( + ValueError, + match=error_message, + ): + VideoDecoder( + NASA_VIDEO.path, + transforms=[v2.RandomCrop(**params)], + ) + + def test_tv_random_crop_nhwc_fails(self): + with pytest.raises( + ValueError, + match="TorchVision v2 RandomCrop is only supported for NCHW", + ): + VideoDecoder( + NASA_VIDEO.path, + transforms=[v2.RandomCrop(size=(100, 100))], + dimension_order="NHWC", + ) + def test_transform_fails(self): with pytest.raises( ValueError,