@@ -41,7 +41,9 @@ class DecoderTransform(ABC):
4141 def _make_transform_spec (self ) -> str :
4242 pass
4343
44- def _get_output_dims (self , input_dims : Tuple [int , int ]) -> Tuple [int , int ]:
44+ def _get_output_dims (
45+ self , input_dims : Tuple [Optional [int ], Optional [int ]]
46+ ) -> Tuple [Optional [int ], Optional [int ]]:
4547 return input_dims
4648
4749
@@ -70,34 +72,39 @@ class Resize(DecoderTransform):
7072 size : Sequence [int ]
7173
7274 def _make_transform_spec (self ) -> str :
75+ # TODO: establish this invariant in the constructor during refactor
7376 assert len (self .size ) == 2
7477 return f"resize, { self .size [0 ]} , { self .size [1 ]} "
7578
76- def _get_output_dims (self , input_dims : Tuple [int , int ]) -> Tuple [int , int ]:
77- return (* self .size ,)
79+ def _get_output_dims (
80+ self , input_dims : Tuple [Optional [int ], Optional [int ]]
81+ ) -> Tuple [Optional [int ], Optional [int ]]:
82+ # TODO: establish this invariant in the constructor during refactor
83+ assert len (self .size ) == 2
84+ return (self .size [0 ], self .size [1 ])
7885
7986 @classmethod
80- def _from_torchvision (cls , resize_tv : nn .Module ):
87+ def _from_torchvision (cls , tv_resize : nn .Module ):
8188 v2 = import_torchvision_transforms_v2 ()
8289
83- assert isinstance (resize_tv , v2 .Resize )
90+ assert isinstance (tv_resize , v2 .Resize )
8491
85- if resize_tv .interpolation is not v2 .InterpolationMode .BILINEAR :
92+ if tv_resize .interpolation is not v2 .InterpolationMode .BILINEAR :
8693 raise ValueError (
8794 "TorchVision Resize transform must use bilinear interpolation."
8895 )
89- if resize_tv .antialias is False :
96+ if tv_resize .antialias is False :
9097 raise ValueError (
9198 "TorchVision Resize transform must have antialias enabled."
9299 )
93- if resize_tv .size is None :
100+ if tv_resize .size is None :
94101 raise ValueError ("TorchVision Resize transform must have a size specified." )
95- if len (resize_tv .size ) != 2 :
102+ if len (tv_resize .size ) != 2 :
96103 raise ValueError (
97104 "TorchVision Resize transform must have a (height, width) "
98- f"pair for the size, got { resize_tv .size } ."
105+ f"pair for the size, got { tv_resize .size } ."
99106 )
100- return cls (size = resize_tv .size )
107+ return cls (size = tv_resize .size )
101108
102109
103110@dataclass
@@ -140,52 +147,92 @@ def _make_transform_spec(self) -> str:
140147 )
141148 if self ._input_dims [0 ] < self .size [0 ] or self ._input_dims [1 ] < self .size [1 ]:
142149 raise ValueError (
143- f"Input dimensions { input_dims } are smaller than the crop size { self .size } ."
150+ f"Input dimensions { self . _input_dims } are smaller than the crop size { self .size } ."
144151 )
145152
146153 # Note: This logic must match the logic in
147154 # torchvision.transforms.v2.RandomCrop.make_params(). Given
148155 # the same seed, they should get the same result. This is an
149156 # API guarantee with our users.
150- self ._top = torch . randint (
151- 0 , self ._input_dims [0 ] - self .size [0 ] + 1 , size = ()
157+ self ._top = int (
158+ torch . randint ( 0 , self ._input_dims [0 ] - self .size [0 ] + 1 , size = ()). item ()
152159 )
153- self ._left = torch . randint (
154- 0 , self ._input_dims [1 ] - self .size [1 ] + 1 , size = ()
160+ self ._left = int (
161+ torch . randint ( 0 , self ._input_dims [1 ] - self .size [1 ] + 1 , size = ()). item ()
155162 )
156163
157164 return f"crop, { self .size [0 ]} , { self .size [1 ]} , { self ._left } , { self ._top } "
158165
159- def _get_output_dims (self , input_dims : Tuple [int , int ]) -> Tuple [int , int ]:
160- self ._input_dims = input_dims
161- return self .size
166+ def _get_output_dims (
167+ self , input_dims : Tuple [Optional [int ], Optional [int ]]
168+ ) -> Tuple [Optional [int ], Optional [int ]]:
169+ # TODO: establish this invariant in the constructor during refactor
170+ assert len (self .size ) == 2
171+
172+ height , width = input_dims
173+ if height is None :
174+ raise ValueError (
175+ "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known."
176+ )
177+ if width is None :
178+ raise ValueError (
179+ "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known."
180+ )
181+
182+ self ._input_dims = (height , width )
183+ return (self .size [0 ], self .size [1 ])
162184
163185 @classmethod
164- def _from_torchvision (cls , random_crop_tv : nn .Module , input_dims : Tuple [int , int ]):
186+ def _from_torchvision (
187+ cls ,
188+ tv_random_crop : nn .Module ,
189+ input_dims : Tuple [Optional [int ], Optional [int ]],
190+ ):
165191 v2 = import_torchvision_transforms_v2 ()
166192
167- assert isinstance (random_crop_tv , v2 .RandomCrop )
193+ assert isinstance (tv_random_crop , v2 .RandomCrop )
168194
169- if random_crop_tv .padding is not None :
195+ if tv_random_crop .padding is not None :
170196 raise ValueError (
171197 "TorchVision RandomCrop transform must not specify padding."
172198 )
173- if random_crop_tv .pad_if_needed is True :
199+
200+ if tv_random_crop .pad_if_needed is True :
174201 raise ValueError (
175202 "TorchVision RandomCrop transform must not specify pad_if_needed."
176203 )
177- if random_crop_tv .fill != 0 :
204+
205+ if tv_random_crop .fill != 0 :
178206 raise ValueError ("TorchVision RandomCrop fill must be 0." )
179- if random_crop_tv .padding_mode != "constant" :
207+
208+ if tv_random_crop .padding_mode != "constant" :
180209 raise ValueError ("TorchVision RandomCrop padding_mode must be constant." )
181- if len (random_crop_tv .size ) != 2 :
210+
211+ if len (tv_random_crop .size ) != 2 :
182212 raise ValueError (
183213 "TorchVision RandomCrop transform must have a (height, width) "
184- f"pair for the size, got { random_crop_tv .size } ."
214+ f"pair for the size, got { tv_random_crop .size } ."
215+ )
216+
217+ height , width = input_dims
218+ if height is None :
219+ raise ValueError (
220+ "Video metadata has no height. RandomCrop can only be used when input frame dimensions are known."
221+ )
222+ if width is None :
223+ raise ValueError (
224+ "Video metadata has no width. RandomCrop can only be used when input frame dimensions are known."
185225 )
186- params = random_crop_tv .make_params (
187- # TODO: deal with NCHW versus NHWC; video decoder knows
188- torch .empty (size = (3 , * input_dims ), dtype = torch .uint8 )
226+
227+ # Note that TorchVision v2 transforms only accept NCHW tensors.
228+ params = tv_random_crop .make_params (
229+ torch .empty (size = (3 , height , width ), dtype = torch .uint8 )
189230 )
190- assert random_crop_tv .size == (params ["height" ], params ["width" ])
191- return cls (size = random_crop_tv .size , _top = params ["top" ], _left = params ["left" ])
231+
232+ if tv_random_crop .size != (params ["height" ], params ["width" ]):
233+ raise ValueError (
234+ f"TorchVision RandomCrop's provided size, { tv_random_crop .size } "
235+ f"must match the computed size, { params ['height' ], params ['width' ]} ."
236+ )
237+
238+ return cls (size = tv_random_crop .size , _top = params ["top" ], _left = params ["left" ])
0 commit comments