Skip to content

Commit f8844f4

Browse files
committed
Simplify; handle pipelines
1 parent 817b1f8 commit f8844f4

File tree

6 files changed

+100
-40
lines changed

6 files changed

+100
-40
lines changed

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,12 +545,14 @@ void SingleStreamDecoder::addVideoStream(
545545

546546
metadataDims_ =
547547
FrameDims(streamMetadata.height.value(), streamMetadata.width.value());
548+
FrameDims currInputDims = metadataDims_;
548549
for (auto& transform : transforms) {
549550
TORCH_CHECK(transform != nullptr, "Transforms should never be nullptr!");
550551
if (transform->getOutputFrameDims().has_value()) {
551552
resizedOutputDims_ = transform->getOutputFrameDims().value();
552553
}
553-
transform->validate(streamMetadata);
554+
transform->validate(currInputDims);
555+
currInputDims = resizedOutputDims_.value_or(metadataDims_);
554556

555557
// Note that we are claiming ownership of the transform objects passed in to
556558
// us.

src/torchcodec/_core/Transform.cpp

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,45 @@ std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
5353
return outputDims_;
5454
}
5555

56-
void CropTransform::validate(const StreamMetadata& streamMetadata) const {
57-
TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
56+
void CropTransform::validate(const FrameDims& inputDims) const {
5857
TORCH_CHECK(
59-
x_ + outputDims_.width <= streamMetadata.width,
60-
"Crop x position out of bounds")
61-
TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
58+
outputDims_.height <= inputDims.height,
59+
"Crop output height (",
60+
outputDims_.height,
61+
") is greater than input height (",
62+
inputDims.height,
63+
")");
6264
TORCH_CHECK(
63-
y_ + outputDims_.height <= streamMetadata.height,
64-
"Crop y position out of bounds");
65+
outputDims_.width <= inputDims.width,
66+
"Crop output width (",
67+
outputDims_.width,
68+
") is greater than input width (",
69+
inputDims.width,
70+
")");
71+
TORCH_CHECK(
72+
x_ <= inputDims.width,
73+
"Crop x start position, ",
74+
x_,
75+
", out of bounds of input width, ",
76+
inputDims.width);
77+
TORCH_CHECK(
78+
x_ + outputDims_.width <= inputDims.width,
79+
"Crop x end position, ",
80+
x_ + outputDims_.width,
81+
", out of bounds of input width ",
82+
inputDims.width);
83+
TORCH_CHECK(
84+
y_ <= inputDims.height,
85+
"Crop y start position, ",
86+
y_,
87+
", out of bounds of input height, ",
88+
inputDims.height);
89+
TORCH_CHECK(
90+
y_ + outputDims_.height <= inputDims.height,
91+
"Crop y end position, ",
92+
y_ + outputDims_.height,
93+
", out of bounds of input height ",
94+
inputDims.height);
6595
}
6696

6797
} // namespace facebook::torchcodec

src/torchcodec/_core/Transform.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ class Transform {
3636
//
3737
// Note that the validation function does not return anything. We expect
3838
// invalid configurations to throw an exception.
39-
virtual void validate(
40-
[[maybe_unused]] const StreamMetadata& streamMetadata) const {}
39+
virtual void validate([[maybe_unused]] const FrameDims& inputDims) const {}
4140
};
4241

4342
class ResizeTransform : public Transform {
@@ -64,7 +63,7 @@ class CropTransform : public Transform {
6463

6564
std::string getFilterGraphCpu() const override;
6665
std::optional<FrameDims> getOutputFrameDims() const override;
67-
void validate(const StreamMetadata& streamMetadata) const override;
66+
void validate(const FrameDims& inputDims) const override;
6867

6968
private:
7069
FrameDims outputDims_;

src/torchcodec/decoders/_video_decoder.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ def _make_transform_specs(
519519
curr_input_dims = input_dims
520520
for transform in transforms:
521521
if isinstance(transform, DecoderTransform):
522-
output_dims = transform._calculate_output_dims(curr_input_dims)
522+
output_dims = transform._get_output_dims()
523523
converted_transforms.append((transform, curr_input_dims))
524524
else:
525525
if not tv_available:
@@ -530,11 +530,11 @@ def _make_transform_specs(
530530
)
531531
elif isinstance(transform, v2.Resize):
532532
tc_transform = Resize._from_torchvision(transform)
533-
output_dims = tc_transform._calculate_output_dims(curr_input_dims)
533+
output_dims = tc_transform._get_output_dims()
534534
converted_transforms.append((tc_transform, curr_input_dims))
535535
elif isinstance(transform, v2.RandomCrop):
536536
tc_transform = RandomCrop._from_torchvision(transform)
537-
output_dims = tc_transform._calculate_output_dims(curr_input_dims)
537+
output_dims = tc_transform._get_output_dims()
538538
converted_transforms.append((tc_transform, curr_input_dims))
539539
else:
540540
raise ValueError(
@@ -543,7 +543,7 @@ def _make_transform_specs(
543543
"v2 transform."
544544
)
545545

546-
curr_input_dims = output_dims
546+
curr_input_dims = output_dims if output_dims is not None else curr_input_dims
547547

548548
return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
549549

src/torchcodec/transforms/_decoder_transforms.py

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,11 @@ def _make_transform_spec(
4343
) -> str:
4444
pass
4545

46-
def _calculate_output_dims(
47-
self, input_dims: Tuple[Optional[int], Optional[int]]
48-
) -> Tuple[Optional[int], Optional[int]]:
49-
return input_dims
46+
# Transforms that change the dimensions of their input frame return a value.
47+
# Transforms that don't change dimensions return None; they can rely on this
48+
# default implementation.
49+
def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
50+
return None
5051

5152

5253
def import_torchvision_transforms_v2() -> ModuleType:
@@ -80,9 +81,7 @@ def _make_transform_spec(
8081
assert len(self.size) == 2
8182
return f"resize, {self.size[0]}, {self.size[1]}"
8283

83-
def _calculate_output_dims(
84-
self, input_dims: Tuple[Optional[int], Optional[int]]
85-
) -> Tuple[Optional[int], Optional[int]]:
84+
def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
8685
# TODO: establish this invariant in the constructor during refactor
8786
assert len(self.size) == 2
8887
return (self.size[0], self.size[1])
@@ -173,24 +172,9 @@ def _make_transform_spec(
173172

174173
return f"crop, {self.size[0]}, {self.size[1]}, {left}, {top}"
175174

176-
def _calculate_output_dims(
177-
self, input_dims: Tuple[Optional[int], Optional[int]]
178-
) -> Tuple[Optional[int], Optional[int]]:
175+
def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
179176
# TODO: establish this invariant in the constructor during refactor
180177
assert len(self.size) == 2
181-
182-
height, width = input_dims
183-
if height is None:
184-
raise ValueError(
185-
"Video metadata has no height. "
186-
"RandomCrop can only be used when input frame dimensions are known."
187-
)
188-
if width is None:
189-
raise ValueError(
190-
"Video metadata has no width. "
191-
"RandomCrop can only be used when input frame dimensions are known."
192-
)
193-
194178
return (self.size[0], self.size[1])
195179

196180
@classmethod

test/test_transform_ops.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,51 @@ def test_crop_fails(self, error_message, params):
257257
transforms=[v2.RandomCrop(**params)],
258258
)
259259

260+
@pytest.mark.parametrize("seed", [0, 314])
261+
def test_random_crop_reusable_objects(self, seed):
262+
torch.manual_seed(seed)
263+
random_crop = torchcodec.transforms.RandomCrop(size=(100, 100))
264+
265+
# Create a spec which causes us to calculate the random crop location.
266+
_ = random_crop._make_transform_spec((1000, 1000))
267+
first_top = random_crop._top
268+
first_left = random_crop._left
269+
270+
# Create a spec again, which should calculate a different random crop
271+
# location.
272+
_ = random_crop._make_transform_spec((1000, 1000))
273+
assert first_top != random_crop._top
274+
assert first_left != random_crop._left
275+
276+
@pytest.mark.parametrize(
277+
"resize, random_crop",
278+
[
279+
(torchcodec.transforms.Resize, torchcodec.transforms.RandomCrop),
280+
(v2.Resize, v2.RandomCrop),
281+
],
282+
)
283+
def test_transform_pipeline(self, resize, random_crop):
284+
decoder = VideoDecoder(
285+
TEST_SRC_2_720P.path,
286+
transforms=[
287+
# resized to bigger than original
288+
resize(size=(2160, 3840)),
289+
# crop to smaller than the resize, but still bigger than original
290+
random_crop(size=(1080, 1920)),
291+
],
292+
)
293+
294+
num_frames = len(decoder)
295+
for frame_index in [
296+
0,
297+
int(num_frames * 0.25),
298+
int(num_frames * 0.5),
299+
int(num_frames * 0.75),
300+
num_frames - 1,
301+
]:
302+
frame = decoder[frame_index]
303+
assert frame.shape == (TEST_SRC_2_720P.get_num_color_channels(), 1080, 1920)
304+
260305
def test_transform_fails(self):
261306
with pytest.raises(
262307
ValueError,
@@ -519,14 +564,14 @@ def test_crop_transform_fails(self):
519564

520565
with pytest.raises(
521566
RuntimeError,
522-
match="x position out of bounds",
567+
match="x start position, 9999, out of bounds",
523568
):
524569
decoder = create_from_file(str(NASA_VIDEO.path))
525570
add_video_stream(decoder, transform_specs="crop, 100, 100, 9999, 100")
526571

527572
with pytest.raises(
528573
RuntimeError,
529-
match="y position out of bounds",
574+
match=r"Crop output height \(999\) is greater than input height \(270\)",
530575
):
531576
decoder = create_from_file(str(NASA_VIDEO.path))
532577
add_video_stream(decoder, transform_specs="crop, 999, 100, 100, 100")

0 commit comments

Comments (0)