
Commit d8b7ed0 (1 parent: 8e6a8f2)

Refactor all the things

3 files changed: +99 -136 lines

src/torchcodec/decoders/_video_decoder.py

Lines changed: 59 additions & 59 deletions
@@ -8,7 +8,7 @@
 import json
 import numbers
 from pathlib import Path
-from typing import List, Literal, Optional, Sequence, Tuple, Union
+from typing import Literal, Optional, Sequence, Tuple, Union
 
 import torch
 from torch import device as torch_device, nn, Tensor
@@ -170,7 +170,6 @@ def __init__(
         transform_specs = _make_transform_specs(
             transforms,
             input_dims=(self.metadata.height, self.metadata.width),
-            dimension_order=dimension_order,
         )
 
         core.add_video_stream(
@@ -452,96 +451,97 @@ def _get_and_validate_stream_metadata(
     )
 
 
-def _convert_to_decoder_transforms(
-    transforms: Sequence[Union[DecoderTransform, nn.Module]],
+def _make_transform_specs(
+    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
     input_dims: Tuple[Optional[int], Optional[int]],
-    dimension_order: Literal["NCHW", "NHWC"],
-) -> List[DecoderTransform]:
-    """Convert a sequence of transforms that may contain TorchVision transform
-    objects into a list of only TorchCodec transform objects.
+) -> str:
+    """Given a sequence of transforms, turn those into the specification string
+    the core API expects.
 
     Args:
-        transforms: Squence of transform objects. The objects can be one of two
-            types:
+        transforms: Optional sequence of transform objects. The objects can be
+            one of two types:
            1. torchcodec.transforms.DecoderTransform
            2. torchvision.transforms.v2.Transform, but our type annotation
               only mentions its base, nn.Module. We don't want to take a
               hard dependency on TorchVision.
+        input_dims: Optional (height, width) pair. Note that only some
+            transforms need to know the dimensions. If the user provides
+            transforms that don't need to know the dimensions, and that metadata
+            is missing, everything should still work. That means we assert their
+            existence as late as possible.
 
     Returns:
-        List of DecoderTransform objects.
+        String of transforms in the format the core API expects: transform
+        specifications separate by semicolons.
     """
+    if transforms is None:
+        return ""
+
     try:
         from torchvision.transforms import v2
 
         tv_available = True
     except ImportError:
         tv_available = False
 
-    converted_transforms: list[DecoderTransform] = []
+    # The following loop accomplishes two tasks:
+    #
+    # 1. Converts the transform to a DecoderTransform, if necessary. We
+    #    accept TorchVision transform objects and they must be converted
+    #    to their matching DecoderTransform.
+    # 2. Calculates what the input dimensions are to each transform.
+    #
+    # The order in our transforms list is semantically meaningful, as we
+    # actually have a pipeline where the output of one transform is the input to
+    # the next. For example, if we have the transforms list [A, B, C, D], then
+    # we should understand that as:
+    #    A -> B -> C -> D
+    # Where the frame produced by A is the input to B, the frame produced by B
+    # is the input to C, etc. This particularly matters for frame dimensions.
+    # Transforms can both:
+    #
+    # 1. Produce frames with arbitrary dimensions.
+    # 2. Rely on their input frame's dimensions to calculate ahead-of-time
+    #    what their runtime behavior will be.
+    #
+    # The consequence of the above facts is that we need to statically track
+    # frame dimensions in the pipeline while we pre-process it. The input
+    # frame's dimensions to A, our first transform, is always what we know from
+    # our metadata. For each transform, we always calculate its output
+    # dimensions from its input dimensions. We store these with the converted
+    # transform, to be all used together when we generate the specs.
+    converted_transforms: list[(DecoderTransform, Tuple[int, int])] = []
+    curr_input_dims = input_dims
     for transform in transforms:
-        if not isinstance(transform, DecoderTransform):
+        if isinstance(transform, DecoderTransform):
+            output_dims = transform._calculate_output_dims(curr_input_dims)
+            converted_transforms.append((transform, curr_input_dims))
+        else:
            if not tv_available:
                raise ValueError(
                    f"The supplied transform, {transform}, is not a TorchCodec "
-                    " DecoderTransform. TorchCodec also accept TorchVision "
+                    " DecoderTransform. TorchCodec also accepts TorchVision "
                    "v2 transforms, but TorchVision is not installed."
                )
            elif isinstance(transform, v2.Resize):
-                transform_tc = Resize._from_torchvision(transform)
-                input_dims = transform_tc._get_output_dims(input_dims)
-                converted_transforms.append(transform_tc)
+                tc_transform = Resize._from_torchvision(transform)
+                output_dims = tc_transform._calculate_output_dims(curr_input_dims)
+                converted_transforms.append((tc_transform, curr_input_dims))
            elif isinstance(transform, v2.RandomCrop):
-                if dimension_order != "NCHW":
-                    raise ValueError(
-                        "TorchVision v2 RandomCrop is only supported for NCHW "
-                        "dimension order. Please use the TorchCodec RandomCrop "
-                        "transform instead."
-                    )
-                transform_tc = RandomCrop._from_torchvision(
-                    transform,
-                    input_dims,
-                )
-                input_dims = transform_tc._get_output_dims(input_dims)
-                converted_transforms.append(transform_tc)
+                tc_transform = RandomCrop._from_torchvision(transform)
+                output_dims = tc_transform._calculate_output_dims(curr_input_dims)
+                converted_transforms.append((tc_transform, curr_input_dims))
            else:
                raise ValueError(
                    f"Unsupported transform: {transform}. Transforms must be "
                    "either a TorchCodec DecoderTransform or a TorchVision "
                    "v2 transform."
                )
-        else:
-            input_dims = transform._get_output_dims(input_dims)
-            converted_transforms.append(transform)
-
-    return converted_transforms
-
 
-def _make_transform_specs(
-    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
-    input_dims: Tuple[Optional[int], Optional[int]],
-    dimension_order: Literal["NCHW", "NHWC"],
-) -> str:
-    """Given a sequence of transforms, turn those into the specification string
-    the core API expects.
-
-    Args:
-        transforms: Optional sequence of transform objects. The objects can be
-            one of two types:
-            1. torchcodec.transforms.DecoderTransform
-            2. torchvision.transforms.v2.Transform, but our type annotation
-               only mentions its base, nn.Module. We don't want to take a
-               hard dependency on TorchVision.
-
-    Returns:
-        String of transforms in the format the core API expects: transform
-        specifications separate by semicolons.
-    """
-    if transforms is None:
-        return ""
+        curr_input_dims = output_dims
 
-    transforms = _convert_to_decoder_transforms(transforms, input_dims, dimension_order)
-    return ";".join([t._make_transform_spec() for t in transforms])
+    return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
 
 
 def _read_custom_frame_mappings(
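
For context on the loop above: the refactored `_make_transform_specs` pairs each transform with the dimensions of the frame it will receive, threads the output dimensions of one transform into the next, and joins the per-transform specs with semicolons. Below is a minimal, self-contained sketch of that bookkeeping; `FakeResize`, `FakeCrop`, and `make_specs` are illustrative stand-ins, not TorchCodec APIs, and the crop offsets here are computed deterministically rather than sampled.

```python
from typing import Tuple


class FakeResize:
    def __init__(self, size: Tuple[int, int]):
        self.size = size

    def calculate_output_dims(self, input_dims):
        # A resize always produces its target size, regardless of input dims.
        return self.size

    def make_spec(self, input_dims) -> str:
        return f"resize, {self.size[0]}, {self.size[1]}"


class FakeCrop:
    def __init__(self, size: Tuple[int, int]):
        self.size = size

    def calculate_output_dims(self, input_dims):
        return self.size

    def make_spec(self, input_dims) -> str:
        # A crop needs its input dims to pick valid offsets. Placeholder logic:
        # anchor at the bottom-right corner (the real RandomCrop samples randomly).
        top = input_dims[0] - self.size[0]
        left = input_dims[1] - self.size[1]
        return f"crop, {self.size[0]}, {self.size[1]}, {left}, {top}"


def make_specs(transforms, input_dims) -> str:
    paired = []
    curr = input_dims
    for t in transforms:
        paired.append((t, curr))              # store each transform with ITS input dims
        curr = t.calculate_output_dims(curr)  # which become the next transform's input
    return ";".join(t.make_spec(dims) for t, dims in paired)


# (270, 480) frame -> resize to (540, 960) -> crop back down to (100, 100)
print(make_specs([FakeResize((540, 960)), FakeCrop((100, 100))], (270, 480)))
# resize, 540, 960;crop, 100, 100, 860, 440
```

The point of storing `(transform, input_dims)` pairs is that the crop sees the post-resize dimensions (540, 960), not the metadata dimensions (270, 480), when it builds its spec.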

src/torchcodec/transforms/_decoder_transforms.py

Lines changed: 34 additions & 58 deletions
@@ -38,10 +38,10 @@ class DecoderTransform(ABC):
     """
 
     @abstractmethod
-    def _make_transform_spec(self) -> str:
+    def _make_transform_spec(self, input_dims: Tuple[int, int]) -> str:
        pass
 
-    def _get_output_dims(
+    def _calculate_output_dims(
        self, input_dims: Tuple[Optional[int], Optional[int]]
    ) -> Tuple[Optional[int], Optional[int]]:
        return input_dims
@@ -71,12 +71,12 @@ class Resize(DecoderTransform):
 
     size: Sequence[int]
 
-    def _make_transform_spec(self) -> str:
+    def _make_transform_spec(self, input_dims: Tuple[int, int]) -> str:
        # TODO: establish this invariant in the constructor during refactor
        assert len(self.size) == 2
        return f"resize, {self.size[0]}, {self.size[1]}"
 
-    def _get_output_dims(
+    def _calculate_output_dims(
        self, input_dims: Tuple[Optional[int], Optional[int]]
    ) -> Tuple[Optional[int], Optional[int]]:
        # TODO: establish this invariant in the constructor during refactor
@@ -125,45 +125,37 @@ class RandomCrop(DecoderTransform):
     """
 
     size: Sequence[int]
+
+    # Note that these values are never read by this object or the decoder. We
+    # record them for testing purposes only.
     _top: Optional[int] = None
     _left: Optional[int] = None
-    _input_dims: Optional[Tuple[int, int]] = None
 
-    def _make_transform_spec(self) -> str:
+    def _make_transform_spec(self, input_dims: Tuple[int, int]) -> str:
        if len(self.size) != 2:
            raise ValueError(
                f"RandomCrop's size must be a sequence of length 2, got {self.size}. "
                "This should never happen, please report a bug."
            )
 
-        if self._top is None or self._left is None:
-            # TODO: It would be very strange if only ONE of those is None. But should we
-            # make it an error? We can continue, but it would probably mean
-            # something bad happened. Dear reviewer, please register an opinion here:
-            if self._input_dims is None:
-                raise ValueError(
-                    "RandomCrop's input_dims must be set before calling _make_transform_spec(). "
-                    "This should never happen, please report a bug."
-                )
-            if self._input_dims[0] < self.size[0] or self._input_dims[1] < self.size[1]:
-                raise ValueError(
-                    f"Input dimensions {self._input_dims} are smaller than the crop size {self.size}."
-                )
-
-            # Note: This logic must match the logic in
-            # torchvision.transforms.v2.RandomCrop.make_params(). Given
-            # the same seed, they should get the same result. This is an
-            # API guarantee with our users.
-            self._top = int(
-                torch.randint(0, self._input_dims[0] - self.size[0] + 1, size=()).item()
-            )
-            self._left = int(
-                torch.randint(0, self._input_dims[1] - self.size[1] + 1, size=()).item()
+        # Note: This logic below must match the logic in
+        # torchvision.transforms.v2.RandomCrop.make_params(). Given
+        # the same seed, they should get the same result. This is an
+        # API guarantee with our users.
+        if input_dims[0] < self.size[0] or input_dims[1] < self.size[1]:
+            raise ValueError(
+                f"Input dimensions {input_dims} are smaller than the crop size {self.size}."
            )
 
-        return f"crop, {self.size[0]}, {self.size[1]}, {self._left}, {self._top}"
+        top = int(torch.randint(0, input_dims[0] - self.size[0] + 1, size=()).item())
+        self._top = top
+
+        left = int(torch.randint(0, input_dims[1] - self.size[1] + 1, size=()).item())
+        self._left = left
+
+        return f"crop, {self.size[0]}, {self.size[1]}, {left}, {top}"
 
-    def _get_output_dims(
+    def _calculate_output_dims(
        self, input_dims: Tuple[Optional[int], Optional[int]]
    ) -> Tuple[Optional[int], Optional[int]]:
        # TODO: establish this invariant in the constructor during refactor
@@ -172,25 +164,30 @@ def _get_output_dims(
         height, width = input_dims
         if height is None:
            raise ValueError(
-                "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known."
+                "Video metadata has no height. "
+                "RandomCrop can only be used when input frame dimensions are known."
            )
        if width is None:
            raise ValueError(
-                "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known."
+                "Video metadata has no width. "
+                "RandomCrop can only be used when input frame dimensions are known."
            )
 
-        self._input_dims = (height, width)
        return (self.size[0], self.size[1])
 
    @classmethod
    def _from_torchvision(
        cls,
        tv_random_crop: nn.Module,
-        input_dims: Tuple[Optional[int], Optional[int]],
    ):
        v2 = import_torchvision_transforms_v2()
 
-        assert isinstance(tv_random_crop, v2.RandomCrop)
+        if not isinstance(tv_random_crop, v2.RandomCrop):
+            raise ValueError(
+                "Transform must be TorchVision's RandomCrop, "
+                f"it is instead {type(tv_random_crop).__name__}. "
+                "This should never happen, please report a bug."
+            )
 
        if tv_random_crop.padding is not None:
            raise ValueError(
@@ -214,25 +211,4 @@ def _from_torchvision(
                 f"pair for the size, got {tv_random_crop.size}."
             )
 
-        height, width = input_dims
-        if height is None:
-            raise ValueError(
-                "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known."
-            )
-        if width is None:
-            raise ValueError(
-                "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known."
-            )
-
-        # Note that TorchVision v2 transforms only accept NCHW tensors.
-        params = tv_random_crop.make_params(
-            torch.empty(size=(3, height, width), dtype=torch.uint8)
-        )
-
-        if tv_random_crop.size != (params["height"], params["width"]):
-            raise ValueError(
-                f"TorchVision RandomCrop's provided size, {tv_random_crop.size} "
-                f"must match the computed size, {params['height'], params['width']}."
-            )
-
-        return cls(size=tv_random_crop.size, _top=params["top"], _left=params["left"])
+        return cls(size=tv_random_crop.size)
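
To illustrate the new `RandomCrop._make_transform_spec` flow above: crop offsets are now sampled from the given `input_dims` at spec-build time, rather than precomputed in `_from_torchvision`, so re-seeding torch reproduces the same offsets. A rough sketch under that assumption; the sizes and seed are placeholders, and the spec string format is taken from the diff.

```python
import torch

size = (100, 100)        # requested crop size
input_dims = (270, 480)  # frame (height, width), known from metadata

# Same sampling shown in the diff: uniform over the valid offset range.
torch.manual_seed(0)
top = int(torch.randint(0, input_dims[0] - size[0] + 1, size=()).item())
left = int(torch.randint(0, input_dims[1] - size[1] + 1, size=()).item())
spec = f"crop, {size[0]}, {size[1]}, {left}, {top}"
print(spec)  # "crop, 100, 100, <left>, <top>" -- the offsets depend on the seed

# Re-seeding reproduces the first draw, which is what lets the decoder-side
# RandomCrop line up with torchvision.transforms.v2.RandomCrop under the same seed.
torch.manual_seed(0)
assert top == int(torch.randint(0, input_dims[0] - size[0] + 1, size=()).item())
```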

test/test_transform_ops.py

Lines changed: 6 additions & 19 deletions
@@ -147,26 +147,28 @@ def test_resize_fails(self):
 
     @pytest.mark.parametrize(
         "height_scaling_factor, width_scaling_factor",
-        ((0.5, 0.5), (0.25, 0.1), (1.0, 1.0), (0.25, 0.25)),
+        ((0.5, 0.5), (0.25, 0.1), (1.0, 1.0), (0.15, 0.75)),
    )
    @pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P])
+    @pytest.mark.parametrize("seed", [0, 1234])
    def test_random_crop_torchvision(
        self,
        height_scaling_factor,
        width_scaling_factor,
        video,
+        seed,
    ):
        height = int(video.get_height() * height_scaling_factor)
        width = int(video.get_width() * width_scaling_factor)
 
        # We want both kinds of RandomCrop objects to get arrive at the same
        # locations to crop, so we need to make sure they get the same random
        # seed.
-        torch.manual_seed(0)
+        torch.manual_seed(seed)
        tc_random_crop = torchcodec.transforms.RandomCrop(size=(height, width))
        decoder_random_crop = VideoDecoder(video.path, transforms=[tc_random_crop])
 
-        torch.manual_seed(0)
+        torch.manual_seed(seed)
        decoder_random_crop_tv = VideoDecoder(
            video.path,
            transforms=[v2.RandomCrop(size=(height, width))],
@@ -179,13 +181,9 @@ def test_random_crop_torchvision(
 
         for frame_index in [
             0,
-            int(num_frames * 0.1),
-            int(num_frames * 0.2),
-            int(num_frames * 0.3),
-            int(num_frames * 0.4),
+            int(num_frames * 0.25),
            int(num_frames * 0.5),
            int(num_frames * 0.75),
-            int(num_frames * 0.90),
            num_frames - 1,
        ]:
            frame_random_crop = decoder_random_crop[frame_index]
@@ -259,17 +257,6 @@ def test_crop_fails(self, error_message, params):
                 transforms=[v2.RandomCrop(**params)],
             )
 
-    def test_tv_random_crop_nhwc_fails(self):
-        with pytest.raises(
-            ValueError,
-            match="TorchVision v2 RandomCrop is only supported for NCHW",
-        ):
-            VideoDecoder(
-                NASA_VIDEO.path,
-                transforms=[v2.RandomCrop(size=(100, 100))],
-                dimension_order="NHWC",
-            )
-
    def test_transform_fails(self):
        with pytest.raises(
            ValueError,
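
The updated test parametrizes over seeds to check that, for a given seed, decoding with TorchCodec's `RandomCrop` and with TorchVision's `v2.RandomCrop` crops every frame at the same location. A minimal usage sketch of that equivalence, assuming the public `torchcodec.decoders.VideoDecoder` import, a placeholder video path, and a plain `torch.equal` comparison (the actual test's assertion helper may differ):

```python
import torch
import torchcodec
from torchcodec.decoders import VideoDecoder
from torchvision.transforms import v2

video_path = "my_video.mp4"  # placeholder path

# Decode with TorchCodec's decoder-side RandomCrop.
torch.manual_seed(0)
tc_decoder = VideoDecoder(
    video_path, transforms=[torchcodec.transforms.RandomCrop(size=(100, 100))]
)

# Decode with TorchVision's v2.RandomCrop converted internally by the decoder.
torch.manual_seed(0)
tv_decoder = VideoDecoder(video_path, transforms=[v2.RandomCrop(size=(100, 100))])

# Same seed, same crop location, so the first frames should match exactly.
assert torch.equal(tc_decoder[0], tv_decoder[0])
```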
