Commit 5017760

Merge branch 'random_crop' into refactor_decoder_transforms

2 parents a92d5f0 + be8ed26

6 files changed: +155 −61 lines changed

src/torchcodec/_core/SingleStreamDecoder.cpp
Lines changed: 3 additions & 1 deletion

```diff
@@ -545,12 +545,14 @@ void SingleStreamDecoder::addVideoStream(
 
   metadataDims_ =
       FrameDims(streamMetadata.height.value(), streamMetadata.width.value());
+  FrameDims currInputDims = metadataDims_;
   for (auto& transform : transforms) {
     TORCH_CHECK(transform != nullptr, "Transforms should never be nullptr!");
     if (transform->getOutputFrameDims().has_value()) {
       resizedOutputDims_ = transform->getOutputFrameDims().value();
     }
-    transform->validate(streamMetadata);
+    transform->validate(currInputDims);
+    currInputDims = resizedOutputDims_.value_or(metadataDims_);
 
     // Note that we are claiming ownership of the transform objects passed in to
     // us.
```
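The effect of this change is that each transform is now validated against the dimensions of the frame it will actually receive, not against the original stream metadata. A minimal Python sketch of the same bookkeeping (illustrative only; `FrameDims` and the duck-typed transform objects stand in for their C++ counterparts):

```python
from typing import List, Optional, Tuple

FrameDims = Tuple[int, int]  # (height, width), standing in for the C++ struct

def validate_transform_chain(
    metadata_dims: FrameDims, transforms: List
) -> Optional[FrameDims]:
    resized_output_dims: Optional[FrameDims] = None  # mirrors resizedOutputDims_
    curr_input_dims = metadata_dims                  # mirrors currInputDims
    for transform in transforms:
        assert transform is not None, "Transforms should never be None!"
        if transform.get_output_frame_dims() is not None:
            resized_output_dims = transform.get_output_frame_dims()
        # Validate against the dims this transform actually sees, which may
        # have been changed by an upstream transform.
        transform.validate(curr_input_dims)
        # Mirrors resizedOutputDims_.value_or(metadataDims_).
        curr_input_dims = resized_output_dims or metadata_dims
    return resized_output_dims
```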

src/torchcodec/_core/Transform.cpp
Lines changed: 37 additions & 7 deletions

```diff
@@ -53,15 +53,45 @@ std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
   return outputDims_;
 }
 
-void CropTransform::validate(const StreamMetadata& streamMetadata) const {
-  TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
+void CropTransform::validate(const FrameDims& inputDims) const {
   TORCH_CHECK(
-      x_ + outputDims_.width <= streamMetadata.width,
-      "Crop x position out of bounds")
-  TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
+      outputDims_.height <= inputDims.height,
+      "Crop output height (",
+      outputDims_.height,
+      ") is greater than input height (",
+      inputDims.height,
+      ")");
   TORCH_CHECK(
-      y_ + outputDims_.height <= streamMetadata.height,
-      "Crop y position out of bounds");
+      outputDims_.width <= inputDims.width,
+      "Crop output width (",
+      outputDims_.width,
+      ") is greater than input width (",
+      inputDims.width,
+      ")");
+  TORCH_CHECK(
+      x_ <= inputDims.width,
+      "Crop x start position, ",
+      x_,
+      ", out of bounds of input width, ",
+      inputDims.width);
+  TORCH_CHECK(
+      x_ + outputDims_.width <= inputDims.width,
+      "Crop x end position, ",
+      x_ + outputDims_.width,
+      ", out of bounds of input width ",
+      inputDims.width);
+  TORCH_CHECK(
+      y_ <= inputDims.height,
+      "Crop y start position, ",
+      y_,
+      ", out of bounds of input height, ",
+      inputDims.height);
+  TORCH_CHECK(
+      y_ + outputDims_.height <= inputDims.height,
+      "Crop y end position, ",
+      y_ + outputDims_.height,
+      ", out of bounds of input height ",
+      inputDims.height);
 }
 
 } // namespace facebook::torchcodec
```
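The new checks validate, in order: that the crop fits inside the input, then that the start and end positions are in bounds on both axes. A compact Python re-expression of the same logic, assuming `x`/`y` are the crop's left/top corner and the checks raise `RuntimeError` on failure (as the failed `TORCH_CHECK`s do in the tests below):

```python
def validate_crop(x: int, y: int, out_h: int, out_w: int, in_h: int, in_w: int) -> None:
    # The crop's output must fit within the input frame.
    if out_h > in_h:
        raise RuntimeError(f"Crop output height ({out_h}) is greater than input height ({in_h})")
    if out_w > in_w:
        raise RuntimeError(f"Crop output width ({out_w}) is greater than input width ({in_w})")
    # The crop's start and end positions must be in bounds on both axes.
    if x > in_w:
        raise RuntimeError(f"Crop x start position, {x}, out of bounds of input width, {in_w}")
    if x + out_w > in_w:
        raise RuntimeError(f"Crop x end position, {x + out_w}, out of bounds of input width {in_w}")
    if y > in_h:
        raise RuntimeError(f"Crop y start position, {y}, out of bounds of input height, {in_h}")
    if y + out_h > in_h:
        raise RuntimeError(f"Crop y end position, {y + out_h}, out of bounds of input height {in_h}")
```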

src/torchcodec/_core/Transform.h
Lines changed: 2 additions & 3 deletions

```diff
@@ -36,8 +36,7 @@ class Transform {
   //
   // Note that the validation function does not return anything. We expect
   // invalid configurations to throw an exception.
-  virtual void validate(
-      [[maybe_unused]] const StreamMetadata& streamMetadata) const {}
+  virtual void validate([[maybe_unused]] const FrameDims& inputDims) const {}
 };
 
 class ResizeTransform : public Transform {
@@ -64,7 +63,7 @@ class CropTransform : public Transform {
 
   std::string getFilterGraphCpu() const override;
   std::optional<FrameDims> getOutputFrameDims() const override;
-  void validate(const StreamMetadata& streamMetadata) const override;
+  void validate(const FrameDims& inputDims) const override;
 
  private:
   FrameDims outputDims_;
```

src/torchcodec/decoders/_video_decoder.py
Lines changed: 11 additions & 12 deletions

```diff
@@ -514,36 +514,35 @@ def _make_transform_specs(
     # dimensions from its input dimensions. We store these with the converted
     # transform, to be all used together when we generate the specs.
     converted_transforms: list[
-        Tuple[DecoderTransform, Tuple[Optional[int], Optional[int]]]
+        Tuple[
+            DecoderTransform,
+            # A (height, width) pair where the values may be missing.
+            Tuple[Optional[int], Optional[int]],
+        ]
     ] = []
     curr_input_dims = input_dims
     for transform in transforms:
-        if isinstance(transform, DecoderTransform):
-            output_dims = transform._calculate_output_dims(curr_input_dims)
-            converted_transforms.append((transform, curr_input_dims))
-        else:
+        if not isinstance(transform, DecoderTransform):
             if not tv_available:
                 raise ValueError(
                     f"The supplied transform, {transform}, is not a TorchCodec "
                     " DecoderTransform. TorchCodec also accepts TorchVision "
                     "v2 transforms, but TorchVision is not installed."
                 )
             elif isinstance(transform, v2.Resize):
-                tc_transform = Resize._from_torchvision(transform)
-                output_dims = tc_transform._calculate_output_dims(curr_input_dims)
-                converted_transforms.append((tc_transform, curr_input_dims))
+                transform = Resize._from_torchvision(transform)
             elif isinstance(transform, v2.RandomCrop):
-                tc_transform = RandomCrop._from_torchvision(transform)
-                output_dims = tc_transform._calculate_output_dims(curr_input_dims)
-                converted_transforms.append((tc_transform, curr_input_dims))
+                transform = RandomCrop._from_torchvision(transform)
             else:
                 raise ValueError(
                     f"Unsupported transform: {transform}. Transforms must be "
                     "either a TorchCodec DecoderTransform or a TorchVision "
                     "v2 transform."
                 )
 
-        curr_input_dims = output_dims
+        converted_transforms.append((transform, curr_input_dims))
+        output_dims = transform._get_output_dims()
+        curr_input_dims = output_dims if output_dims is not None else curr_input_dims
 
     return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
```
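After conversion, TorchCodec and TorchVision v2 transforms can be freely mixed in one pipeline. A usage sketch based on this commit's tests ("video.mp4" is a placeholder path):

```python
import torch
import torchcodec
from torchcodec.decoders import VideoDecoder
from torchvision.transforms import v2

# RandomCrop picks its crop location when the VideoDecoder is constructed,
# so seed beforehand for reproducibility.
torch.manual_seed(0)
decoder = VideoDecoder(
    "video.mp4",  # placeholder path
    transforms=[
        torchcodec.transforms.Resize(size=(2160, 3840)),
        v2.RandomCrop(size=(1080, 1920)),  # converted via RandomCrop._from_torchvision
    ],
)
frame = decoder[0]  # C x 1080 x 1920; every frame is cropped at the same location
```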

src/torchcodec/transforms/_decoder_transforms.py
Lines changed: 41 additions & 28 deletions

```diff
@@ -39,12 +39,44 @@ class DecoderTransform(ABC):
     def _make_transform_spec(
         self, input_dims: Tuple[Optional[int], Optional[int]]
     ) -> str:
+        """Makes the transform spec that is used by the `VideoDecoder`.
+
+        Args:
+            input_dims (Tuple[Optional[int], Optional[int]]): The dimensions of
+                the input frame in the form (height, width). We cannot know the
+                dimensions at object construction time because it's dependent on
+                the video being decoded and upstream transforms in the same
+                transform pipeline. Not all transforms need to know this; those
+                that don't will ignore it. The individual values in the tuple are
+                optional because the original values come from file metadata which
+                may be missing. We maintain the optionality throughout the APIs so
+                that we can decide as late as possible that it's necessary for the
+                values to exist. That is, if the values are missing from the
+                metadata and we have transforms which ignore the input dimensions,
+                we want that to still work.
+
+        Note: This method is the moral equivalent of TorchVision's
+        `Transform.make_params()`.
+
+        Returns:
+            str: A string which contains the spec for the transform that the
+                `VideoDecoder` knows what to do with.
+        """
         pass
 
-    def _calculate_output_dims(
-        self, input_dims: Tuple[Optional[int], Optional[int]]
-    ) -> Tuple[Optional[int], Optional[int]]:
-        return input_dims
+    def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
+        """Get the dimensions of the output frame.
+
+        Transforms that change the frame dimensions need to override this
+        method. Transforms that don't change the frame dimensions can rely on
+        this default implementation.
+
+        Returns:
+            Optional[Tuple[Optional[int], Optional[int]]]: The output dimensions.
+            - None: The output dimensions are the same as the input dimensions.
+            - (int, int): The (height, width) of the output frame.
+        """
+        return None
 
 
 def import_torchvision_transforms_v2() -> ModuleType:
@@ -64,7 +96,7 @@ class Resize(DecoderTransform):
     Interpolation is always bilinear. Anti-aliasing is always on.
 
     Args:
-        size: (sequence of int): Desired output size. Must be a sequence of
+        size (Sequence[int]): Desired output size. Must be a sequence of
             the form (height, width).
     """
 
@@ -81,9 +113,7 @@ def _make_transform_spec(
     ) -> str:
         return f"resize, {self.size[0]}, {self.size[1]}"
 
-    def _calculate_output_dims(
-        self, input_dims: Tuple[Optional[int], Optional[int]]
-    ) -> Tuple[Optional[int], Optional[int]]:
+    def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
         return (self.size[0], self.size[1])
 
     @classmethod
@@ -116,13 +146,13 @@ class RandomCrop(DecoderTransform):
     Complementary TorchVision transform: :class:`~torchvision.transforms.v2.RandomCrop`.
     Padding of all kinds is disabled. The random location within the frame is
     determined during the initialization of the
-    :class:~`torchcodec.decoders.VideoDecoder` object that owns this transform.
+    :class:`~torchcodec.decoders.VideoDecoder` object that owns this transform.
     As a consequence, each decoded frame in the video will be cropped at the
     same location. Videos with variable resolution may result in undefined
     behavior.
 
     Args:
-        size: (sequence of int): Desired output size. Must be a sequence of
+        size (Sequence[int]): Desired output size. Must be a sequence of
             the form (height, width).
     """
 
@@ -159,28 +189,11 @@ def _make_transform_spec(
         )
 
         top = int(torch.randint(0, height - self.size[0] + 1, size=()).item())
-        self._top = top
-
         left = int(torch.randint(0, width - self.size[1] + 1, size=()).item())
-        self._left = left
 
         return f"crop, {self.size[0]}, {self.size[1]}, {left}, {top}"
 
-    def _calculate_output_dims(
-        self, input_dims: Tuple[Optional[int], Optional[int]]
-    ) -> Tuple[Optional[int], Optional[int]]:
-        height, width = input_dims
-        if height is None:
-            raise ValueError(
-                "Video metadata has no height. "
-                "RandomCrop can only be used when input frame dimensions are known."
-            )
-        if width is None:
-            raise ValueError(
-                "Video metadata has no width. "
-                "RandomCrop can only be used when input frame dimensions are known."
-            )
-
+    def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
         return (self.size[0], self.size[1])
 
     @classmethod
```
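For reference, the spec strings these methods emit look like the following. A sketch; the formats come from the `_make_transform_spec` implementations above, and the exact crop offsets depend on the seeded RNG:

```python
import torch
import torchcodec

resize = torchcodec.transforms.Resize(size=(1080, 1920))
# Resize ignores its input dims, so missing metadata values are fine here.
assert resize._make_transform_spec((None, None)) == "resize, 1080, 1920"
assert resize._get_output_dims() == (1080, 1920)

torch.manual_seed(0)
random_crop = torchcodec.transforms.RandomCrop(size=(99, 99))
# Emits "crop, 99, 99, <left>, <top>" with a freshly sampled location;
# the (height, width) input dims bound where the crop can land.
spec = random_crop._make_transform_spec((888, 888))
assert spec.startswith("crop, 99, 99, ")
```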

test/test_transform_ops.py
Lines changed: 61 additions & 10 deletions

```diff
@@ -172,11 +172,15 @@ def test_random_crop_torchvision(
 
         # We want both kinds of RandomCrop objects to arrive at the same
         # locations to crop, so we need to make sure they get the same random
-        # seed.
+        # seed. It's used in RandomCrop's _make_transform_spec() method, called
+        # by the VideoDecoder.
        torch.manual_seed(seed)
         tc_random_crop = torchcodec.transforms.RandomCrop(size=(height, width))
         decoder_random_crop = VideoDecoder(video.path, transforms=[tc_random_crop])
 
+        # Resetting manual seed for when TorchCodec's RandomCrop, created from
+        # the TorchVision RandomCrop, is used inside of the VideoDecoder. It
+        # needs to match the call above.
         torch.manual_seed(seed)
         decoder_random_crop_tv = VideoDecoder(
             video.path,
@@ -202,14 +206,11 @@
         expected_shape = (video.get_num_color_channels(), height, width)
         assert frame_random_crop_tv.shape == expected_shape
 
+        # Resetting manual seed to make sure the invocation of the
+        # TorchVision RandomCrop matches the two calls above.
+        torch.manual_seed(seed)
         frame_full = decoder_full[frame_index]
-        frame_tv = v2.functional.crop(
-            frame_full,
-            top=tc_random_crop._top,
-            left=tc_random_crop._left,
-            height=tc_random_crop.size[0],
-            width=tc_random_crop.size[1],
-        )
+        frame_tv = v2.RandomCrop(size=(height, width))(frame_full)
         assert_frames_equal(frame_random_crop, frame_tv)
 
     @pytest.mark.parametrize(
@@ -266,6 +267,56 @@ def test_crop_fails(self, error_message, params):
                 transforms=[v2.RandomCrop(**params)],
             )
 
+    @pytest.mark.parametrize("seed", [0, 314])
+    def test_random_crop_reusable_objects(self, seed):
+        torch.manual_seed(seed)
+        random_crop = torchcodec.transforms.RandomCrop(size=(99, 99))
+
+        # Create a spec which causes us to calculate the random crop location.
+        first_spec = random_crop._make_transform_spec((888, 888))
+
+        # Create a spec again, which should calculate a different random crop
+        # location. Despite having the same image size, the specs should be
+        # different because the crop should be at a different location.
+        second_spec = random_crop._make_transform_spec((888, 888))
+        assert first_spec != second_spec
+
+        # Create a spec again, but with a different image size. The specs should
+        # obviously be different, but the original image size should not be in
+        # the spec at all.
+        third_spec = random_crop._make_transform_spec((777, 777))
+        assert third_spec != first_spec
+        assert "888" not in third_spec
+
+    @pytest.mark.parametrize(
+        "resize, random_crop",
+        [
+            (torchcodec.transforms.Resize, torchcodec.transforms.RandomCrop),
+            (v2.Resize, v2.RandomCrop),
+        ],
+    )
+    def test_transform_pipeline(self, resize, random_crop):
+        decoder = VideoDecoder(
+            TEST_SRC_2_720P.path,
+            transforms=[
+                # resized to bigger than original
+                resize(size=(2160, 3840)),
+                # crop to smaller than the resize, but still bigger than original
+                random_crop(size=(1080, 1920)),
+            ],
+        )
+
+        num_frames = len(decoder)
+        for frame_index in [
+            0,
+            int(num_frames * 0.25),
+            int(num_frames * 0.5),
+            int(num_frames * 0.75),
+            num_frames - 1,
+        ]:
+            frame = decoder[frame_index]
+            assert frame.shape == (TEST_SRC_2_720P.get_num_color_channels(), 1080, 1920)
+
     def test_transform_fails(self):
         with pytest.raises(
             ValueError,
@@ -528,14 +579,14 @@ def test_crop_transform_fails(self):
 
         with pytest.raises(
             RuntimeError,
-            match="x position out of bounds",
+            match="x start position, 9999, out of bounds",
         ):
             decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, 100, 100, 9999, 100")
 
         with pytest.raises(
             RuntimeError,
-            match="y position out of bounds",
+            match=r"Crop output height \(999\) is greater than input height \(270\)",
         ):
             decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, 999, 100, 100, 100")
```
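The new test_transform_pipeline exercises exactly the case the SingleStreamDecoder change enables: a 1080x1920 crop that would be out of bounds for the original 720p stream but is valid against the 2160x3840 resized input. Under the ";"-joined format produced by `_make_transform_specs`, that pipeline serializes to something like the sketch below (the crop offsets are illustrative, and the `create_from_file`/`add_video_stream` helpers are assumed to come from `torchcodec._core` as in this test module):

```python
from torchcodec._core import add_video_stream, create_from_file

decoder = create_from_file("video.mp4")  # placeholder path
# The crop spec appears after the resize spec, so it is validated against the
# resized 2160x3840 dims rather than the original stream metadata dims.
add_video_stream(
    decoder, transform_specs="resize, 2160, 3840;crop, 1080, 1920, 960, 540"
)
```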
