Skip to content

Commit 7bc569a

Browse files
authored
Implement CenterCrop (#1094)
1 parent 1b13e58 commit 7bc569a

File tree

8 files changed

+175
-30
lines changed

8 files changed

+175
-30
lines changed

docs/source/api_ref_transforms.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
1414
:template: dataclass.rst
1515

1616
DecoderTransform
17+
CenterCrop
1718
RandomCrop
1819
Resize

src/torchcodec/_core/Transform.cpp

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,22 @@ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
3737
return outputDims_;
3838
}
3939

40+
CropTransform::CropTransform(const FrameDims& dims) : outputDims_(dims) {}
41+
4042
CropTransform::CropTransform(const FrameDims& dims, int x, int y)
4143
: outputDims_(dims), x_(x), y_(y) {
4244
TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
4345
TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
4446
}
4547

4648
std::string CropTransform::getFilterGraphCpu() const {
49+
// For the FFmpeg filter crop, if the x and y coordinates are left
50+
// unspecified, it defaults to a center crop.
51+
std::string coordinates = x_.has_value()
52+
? (":" + std::to_string(x_.value()) + ":" + std::to_string(y_.value()))
53+
: "";
4754
return "crop=" + std::to_string(outputDims_.width) + ":" +
48-
std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
49-
std::to_string(y_) + ":exact=1";
55+
std::to_string(outputDims_.height) + coordinates + ":exact=1";
5056
}
5157

5258
std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
@@ -69,29 +75,34 @@ void CropTransform::validate(const FrameDims& inputDims) const {
6975
inputDims.width,
7076
")");
7177
TORCH_CHECK(
72-
x_ <= inputDims.width,
73-
"Crop x start position, ",
74-
x_,
75-
", out of bounds of input width, ",
76-
inputDims.width);
77-
TORCH_CHECK(
78-
x_ + outputDims_.width <= inputDims.width,
79-
"Crop x end position, ",
80-
x_ + outputDims_.width,
81-
", out of bounds of input width ",
82-
inputDims.width);
83-
TORCH_CHECK(
84-
y_ <= inputDims.height,
85-
"Crop y start position, ",
86-
y_,
87-
", out of bounds of input height, ",
88-
inputDims.height);
89-
TORCH_CHECK(
90-
y_ + outputDims_.height <= inputDims.height,
91-
"Crop y end position, ",
92-
y_ + outputDims_.height,
93-
", out of bounds of input height ",
94-
inputDims.height);
78+
x_.has_value() == y_.has_value(),
79+
"Crop x and y values must be both set or both unset");
80+
if (x_.has_value()) {
81+
TORCH_CHECK(
82+
x_.value() <= inputDims.width,
83+
"Crop x start position, ",
84+
x_.value(),
85+
", out of bounds of input width, ",
86+
inputDims.width);
87+
TORCH_CHECK(
88+
x_.value() + outputDims_.width <= inputDims.width,
89+
"Crop x end position, ",
90+
x_.value() + outputDims_.width,
91+
", out of bounds of input width ",
92+
inputDims.width);
93+
TORCH_CHECK(
94+
y_.value() <= inputDims.height,
95+
"Crop y start position, ",
96+
y_.value(),
97+
", out of bounds of input height, ",
98+
inputDims.height);
99+
TORCH_CHECK(
100+
y_.value() + outputDims_.height <= inputDims.height,
101+
"Crop y end position, ",
102+
y_.value() + outputDims_.height,
103+
", out of bounds of input height ",
104+
inputDims.height);
105+
}
95106
}
96107

97108
} // namespace facebook::torchcodec

src/torchcodec/_core/Transform.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,17 @@ class CropTransform : public Transform {
6161
public:
6262
CropTransform(const FrameDims& dims, int x, int y);
6363

64+
// Becomes a center crop if x and y are not specified.
65+
CropTransform(const FrameDims& dims);
66+
6467
std::string getFilterGraphCpu() const override;
6568
std::optional<FrameDims> getOutputFrameDims() const override;
6669
void validate(const FrameDims& inputDims) const override;
6770

6871
private:
6972
FrameDims outputDims_;
70-
int x_;
71-
int y_;
73+
std::optional<int> x_;
74+
std::optional<int> y_;
7275
};
7376

7477
} // namespace facebook::torchcodec

src/torchcodec/_core/custom_ops.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,23 @@ Transform* makeCropTransform(
288288
return new CropTransform(FrameDims(height, width), x, y);
289289
}
290290

291+
// CenterCrop transform specs take the form:
292+
//
293+
// "center_crop, <height>, <width>"
294+
//
295+
// Where "center_crop" is the string literal and <height>, <width> are
296+
// positive integers. Note that we follow the PyTorch convention of (height,
297+
// width) for specifying image dimensions; FFmpeg uses (width, height).
298+
Transform* makeCenterCropTransform(
299+
const std::vector<std::string>& cropTransformSpec) {
300+
TORCH_CHECK(
301+
cropTransformSpec.size() == 3,
302+
"cropTransformSpec must have 3 elements including its name");
303+
int height = checkedToPositiveInt(cropTransformSpec[1]);
304+
int width = checkedToPositiveInt(cropTransformSpec[2]);
305+
return new CropTransform(FrameDims(height, width));
306+
}
307+
291308
std::vector<std::string> split(const std::string& str, char delimiter) {
292309
std::vector<std::string> tokens;
293310
std::string token;
@@ -317,6 +334,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
317334
transforms.push_back(makeResizeTransform(transformSpec));
318335
} else if (name == "crop") {
319336
transforms.push_back(makeCropTransform(transformSpec));
337+
} else if (name == "center_crop") {
338+
transforms.push_back(makeCenterCropTransform(transformSpec));
320339
} else {
321340
TORCH_CHECK(false, "Invalid transform name: " + name);
322341
}

src/torchcodec/decoders/_video_decoder.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
create_decoder,
2020
ERROR_REPORTING_INSTRUCTIONS,
2121
)
22-
from torchcodec.transforms import DecoderTransform, RandomCrop, Resize
22+
from torchcodec.transforms import CenterCrop, DecoderTransform, RandomCrop, Resize
2323

2424

2525
class VideoDecoder:
@@ -531,6 +531,8 @@ def _make_transform_specs(
531531
)
532532
elif isinstance(transform, v2.Resize):
533533
transform = Resize._from_torchvision(transform)
534+
elif isinstance(transform, v2.CenterCrop):
535+
transform = CenterCrop._from_torchvision(transform)
534536
elif isinstance(transform, v2.RandomCrop):
535537
transform = RandomCrop._from_torchvision(transform)
536538
else:

src/torchcodec/transforms/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,9 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._decoder_transforms import DecoderTransform, RandomCrop, Resize # noqa
7+
from ._decoder_transforms import ( # noqa
8+
CenterCrop,
9+
DecoderTransform,
10+
RandomCrop,
11+
Resize,
12+
)

src/torchcodec/transforms/_decoder_transforms.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,55 @@ def _from_torchvision(cls, tv_resize: nn.Module):
140140
return cls(size=tv_resize.size)
141141

142142

143+
class CenterCrop(DecoderTransform):
144+
"""Crop the decoded frame to a given size in the center of the frame.
145+
146+
Complementary TorchVision transform: :class:`~torchvision.transforms.v2.CenterCrop`.
147+
148+
Args:
149+
size (Sequence[int]): Desired output size. Must be a sequence of
150+
the form (height, width).
151+
"""
152+
153+
def __init__(self, size: Sequence[int]):
154+
if len(size) != 2:
155+
raise ValueError(
156+
"CenterCrop transform must have a (height, width) "
157+
f"pair for the size, got {size}."
158+
)
159+
self.size = size
160+
161+
def _make_transform_spec(
162+
self, input_dims: Tuple[Optional[int], Optional[int]]
163+
) -> str:
164+
return f"center_crop, {self.size[0]}, {self.size[1]}"
165+
166+
def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
167+
return (self.size[0], self.size[1])
168+
169+
@classmethod
170+
def _from_torchvision(
171+
cls,
172+
tv_center_crop: nn.Module,
173+
):
174+
v2 = import_torchvision_transforms_v2()
175+
176+
if not isinstance(tv_center_crop, v2.CenterCrop):
177+
raise ValueError(
178+
"Transform must be TorchVision's CenterCrop, "
179+
f"it is instead {type(tv_center_crop).__name__}. "
180+
"This should never happen, please report a bug."
181+
)
182+
183+
if len(tv_center_crop.size) != 2:
184+
raise ValueError(
185+
"TorchVision CenterCrop transform must have a (height, width) "
186+
f"pair for the size, got {tv_center_crop.size}."
187+
)
188+
189+
return cls(size=tv_center_crop.size)
190+
191+
143192
class RandomCrop(DecoderTransform):
144193
"""Crop the decoded frame to a given size at a random location in the frame.
145194

test/test_transform_ops.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,61 @@ def test_resize_fails(self):
154154
transforms=[torchcodec.transforms.Resize(size=(100, 100, 100))],
155155
)
156156

157+
@pytest.mark.parametrize(
158+
"height_scaling_factor, width_scaling_factor",
159+
((0.5, 0.5), (0.25, 0.1), (1.0, 1.0), (0.15, 0.75)),
160+
)
161+
@pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P])
162+
def test_center_crop_torchvision(
163+
self,
164+
height_scaling_factor,
165+
width_scaling_factor,
166+
video,
167+
):
168+
height = int(video.get_height() * height_scaling_factor)
169+
width = int(video.get_width() * width_scaling_factor)
170+
171+
tc_center_crop = torchcodec.transforms.CenterCrop(size=(height, width))
172+
decoder_center_crop = VideoDecoder(video.path, transforms=[tc_center_crop])
173+
174+
decoder_center_crop_tv = VideoDecoder(
175+
video.path,
176+
transforms=[v2.CenterCrop(size=(height, width))],
177+
)
178+
179+
decoder_full = VideoDecoder(video.path)
180+
181+
num_frames = len(decoder_center_crop_tv)
182+
assert num_frames == len(decoder_full)
183+
184+
for frame_index in [
185+
0,
186+
int(num_frames * 0.25),
187+
int(num_frames * 0.5),
188+
int(num_frames * 0.75),
189+
num_frames - 1,
190+
]:
191+
frame_center_crop = decoder_center_crop[frame_index]
192+
frame_center_crop_tv = decoder_center_crop_tv[frame_index]
193+
assert_frames_equal(frame_center_crop, frame_center_crop_tv)
194+
195+
expected_shape = (video.get_num_color_channels(), height, width)
196+
assert frame_center_crop_tv.shape == expected_shape
197+
198+
frame_full = decoder_full[frame_index]
199+
frame_tv = v2.CenterCrop(size=(height, width))(frame_full)
200+
assert_frames_equal(frame_center_crop, frame_tv)
201+
202+
def test_center_crop_fails(self):
203+
with pytest.raises(
204+
ValueError,
205+
match=r"must have a \(height, width\) pair for the size",
206+
):
207+
VideoDecoder(
208+
NASA_VIDEO.path,
209+
transforms=[torchcodec.transforms.CenterCrop(size=(100,))],
210+
)
211+
157212
@pytest.mark.parametrize(
158213
"height_scaling_factor, width_scaling_factor",
159214
((0.5, 0.5), (0.25, 0.1), (1.0, 1.0), (0.15, 0.75)),
@@ -257,7 +312,7 @@ def test_random_crop_nhwc(
257312
),
258313
),
259314
)
260-
def test_crop_fails(self, error_message, params):
315+
def test_random_crop_fails(self, error_message, params):
261316
with pytest.raises(
262317
ValueError,
263318
match=error_message,

0 commit comments

Comments (0)