
Commit d9441d0

Author: pytorchbot
Committed: 2025-12-05 nightly release (4e412b7)
1 parent 71fac63 · commit d9441d0

File tree: 10 files changed, +302 −149 lines


README.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -3,8 +3,8 @@
 # TorchCodec
 
 TorchCodec is a Python library for decoding video and audio data into PyTorch
-tensors, on CPU and CUDA GPU. It also supports audio encoding, and video
-encoding will come soon! It aims to be fast, easy to use, and well integrated
+tensors, on CPU and CUDA GPU. It also supports video and audio encoding on CPU!
+It aims to be fast, easy to use, and well integrated
 into the PyTorch ecosystem. If you want to use PyTorch to train ML models on
 videos and audio, TorchCodec is how you turn these into data.
 
@@ -130,7 +130,8 @@ The following table indicates the compatibility between versions of
 
 | `torchcodec`       | `torch`            | Python              |
 | ------------------ | ------------------ | ------------------- |
-| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.13`  |
+| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.14`  |
+| `0.9`              | `2.9`              | `>=3.10`, `<=3.14`  |
 | `0.8`              | `2.9`              | `>=3.10`, `<=3.13`  |
 | `0.7`              | `2.8`              | `>=3.9`, `<=3.13`   |
 | `0.6`              | `2.8`              | `>=3.9`, `<=3.13`   |
```
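The README pitch above amounts to a very small API surface. As a hedged sketch of the documented `VideoDecoder` usage ("video.mp4" is a placeholder path):

```python
# Minimal decode-to-tensor sketch; "video.mp4" is a placeholder.
from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")  # decodes on CPU by default
frame = decoder[0]                   # first frame as a uint8 tensor
print(frame.shape, frame.dtype)      # e.g. torch.Size([3, 720, 1280]) torch.uint8
```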

docs/source/api_ref_transforms.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -14,4 +14,6 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
    :template: dataclass.rst
 
     DecoderTransform
+    CenterCrop
+    RandomCrop
     Resize
```

src/torchcodec/_core/Transform.cpp

Lines changed: 36 additions & 25 deletions
```diff
@@ -37,16 +37,22 @@ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
   return outputDims_;
 }
 
+CropTransform::CropTransform(const FrameDims& dims) : outputDims_(dims) {}
+
 CropTransform::CropTransform(const FrameDims& dims, int x, int y)
     : outputDims_(dims), x_(x), y_(y) {
   TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
   TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
 }
 
 std::string CropTransform::getFilterGraphCpu() const {
+  // For the FFmpeg filter crop, if the x and y coordinates are left
+  // unspecified, it defaults to a center crop.
+  std::string coordinates = x_.has_value()
+      ? (":" + std::to_string(x_.value()) + ":" + std::to_string(y_.value()))
+      : "";
   return "crop=" + std::to_string(outputDims_.width) + ":" +
-      std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
-      std::to_string(y_) + ":exact=1";
+      std::to_string(outputDims_.height) + coordinates + ":exact=1";
 }
 
 std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
@@ -69,29 +75,34 @@ void CropTransform::validate(const FrameDims& inputDims) const {
       inputDims.width,
       ")");
   TORCH_CHECK(
-      x_ <= inputDims.width,
-      "Crop x start position, ",
-      x_,
-      ", out of bounds of input width, ",
-      inputDims.width);
-  TORCH_CHECK(
-      x_ + outputDims_.width <= inputDims.width,
-      "Crop x end position, ",
-      x_ + outputDims_.width,
-      ", out of bounds of input width ",
-      inputDims.width);
-  TORCH_CHECK(
-      y_ <= inputDims.height,
-      "Crop y start position, ",
-      y_,
-      ", out of bounds of input height, ",
-      inputDims.height);
-  TORCH_CHECK(
-      y_ + outputDims_.height <= inputDims.height,
-      "Crop y end position, ",
-      y_ + outputDims_.height,
-      ", out of bounds of input height ",
-      inputDims.height);
+      x_.has_value() == y_.has_value(),
+      "Crop x and y values must be both set or both unset");
+  if (x_.has_value()) {
+    TORCH_CHECK(
+        x_.value() <= inputDims.width,
+        "Crop x start position, ",
+        x_.value(),
+        ", out of bounds of input width, ",
+        inputDims.width);
+    TORCH_CHECK(
+        x_.value() + outputDims_.width <= inputDims.width,
+        "Crop x end position, ",
+        x_.value() + outputDims_.width,
+        ", out of bounds of input width ",
+        inputDims.width);
+    TORCH_CHECK(
+        y_.value() <= inputDims.height,
+        "Crop y start position, ",
+        y_.value(),
+        ", out of bounds of input height, ",
+        inputDims.height);
+    TORCH_CHECK(
+        y_.value() + outputDims_.height <= inputDims.height,
+        "Crop y end position, ",
+        y_.value() + outputDims_.height,
+        ", out of bounds of input height ",
+        inputDims.height);
+  }
 }
 
 } // namespace facebook::torchcodec
```
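The rewritten `getFilterGraphCpu` relies on a property of FFmpeg's `crop` filter: when the `x:y` coordinates are omitted, the filter centers the crop on its own. A hypothetical Python mirror of the string construction, only to show the two filtergraph forms it can now emit:

```python
from typing import Optional

def crop_filtergraph(width: int, height: int,
                     x: Optional[int] = None, y: Optional[int] = None) -> str:
    """Hypothetical mirror of CropTransform::getFilterGraphCpu."""
    # FFmpeg's crop filter defaults to a center crop when x and y are omitted.
    coordinates = f":{x}:{y}" if x is not None else ""
    return f"crop={width}:{height}{coordinates}:exact=1"

print(crop_filtergraph(224, 224))          # crop=224:224:exact=1 (center crop)
print(crop_filtergraph(224, 224, 10, 20))  # crop=224:224:10:20:exact=1
```

Note the `width:height` order in the filtergraph string: FFmpeg's convention is (width, height), while the PyTorch-facing API uses (height, width).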

src/torchcodec/_core/Transform.h

Lines changed: 5 additions & 2 deletions
```diff
@@ -61,14 +61,17 @@ class CropTransform : public Transform {
  public:
   CropTransform(const FrameDims& dims, int x, int y);
 
+  // Becomes a center crop if x and y are not specified.
+  CropTransform(const FrameDims& dims);
+
   std::string getFilterGraphCpu() const override;
   std::optional<FrameDims> getOutputFrameDims() const override;
   void validate(const FrameDims& inputDims) const override;
 
  private:
   FrameDims outputDims_;
-  int x_;
-  int y_;
+  std::optional<int> x_;
+  std::optional<int> y_;
 };
 
 } // namespace facebook::torchcodec
```

src/torchcodec/_core/custom_ops.cpp

Lines changed: 19 additions & 0 deletions
```diff
@@ -288,6 +288,23 @@ Transform* makeCropTransform(
   return new CropTransform(FrameDims(height, width), x, y);
 }
 
+// CenterCrop transform specs take the form:
+//
+//   "center_crop, <height>, <width>"
+//
+// Where "center_crop" is the string literal and <height>, <width> are
+// positive integers. Note that we follow the PyTorch convention of (height,
+// width) for specifying image dimensions; FFmpeg uses (width, height).
+Transform* makeCenterCropTransform(
+    const std::vector<std::string>& cropTransformSpec) {
+  TORCH_CHECK(
+      cropTransformSpec.size() == 3,
+      "cropTransformSpec must have 3 elements including its name");
+  int height = checkedToPositiveInt(cropTransformSpec[1]);
+  int width = checkedToPositiveInt(cropTransformSpec[2]);
+  return new CropTransform(FrameDims(height, width));
+}
+
 std::vector<std::string> split(const std::string& str, char delimiter) {
   std::vector<std::string> tokens;
   std::string token;
@@ -317,6 +334,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
       transforms.push_back(makeResizeTransform(transformSpec));
     } else if (name == "crop") {
       transforms.push_back(makeCropTransform(transformSpec));
+    } else if (name == "center_crop") {
+      transforms.push_back(makeCenterCropTransform(transformSpec));
     } else {
       TORCH_CHECK(false, "Invalid transform name: " + name);
     }
```
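`makeTransforms` splits the raw spec string and dispatches on each spec's first field; per the comment above and the `_make_transform_specs` docstring further down, specs are joined by semicolons and fields by commas. A hypothetical sketch of that grammar, with made-up spec values:

```python
# Hypothetical illustration of the transform-spec format handled by
# makeTransforms: ';' separates specs, ',' separates fields, the transform
# name comes first, and dimensions follow the PyTorch (height, width) order.
spec_string = "resize, 256, 256;center_crop, 224, 224"

for transform_spec in spec_string.split(";"):
    name, *args = (field.strip() for field in transform_spec.split(","))
    print(name, args)
# resize ['256', '256']
# center_crop ['224', '224']
```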

src/torchcodec/decoders/_video_decoder.py

Lines changed: 2 additions & 97 deletions
```diff
@@ -19,7 +19,8 @@
     create_decoder,
     ERROR_REPORTING_INSTRUCTIONS,
 )
-from torchcodec.transforms import DecoderTransform, RandomCrop, Resize
+from torchcodec.transforms import DecoderTransform
+from torchcodec.transforms._decoder_transforms import _make_transform_specs
 
 
 class VideoDecoder:
@@ -451,102 +452,6 @@ def _get_and_validate_stream_metadata(
     )
 
 
-def _make_transform_specs(
-    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
-    input_dims: Tuple[Optional[int], Optional[int]],
-) -> str:
-    """Given a sequence of transforms, turn those into the specification string
-    the core API expects.
-
-    Args:
-        transforms: Optional sequence of transform objects. The objects can be
-            one of two types:
-                1. torchcodec.transforms.DecoderTransform
-                2. torchvision.transforms.v2.Transform, but our type annotation
-                   only mentions its base, nn.Module. We don't want to take a
-                   hard dependency on TorchVision.
-        input_dims: Optional (height, width) pair. Note that only some
-            transforms need to know the dimensions. If the user provides
-            transforms that don't need to know the dimensions, and that metadata
-            is missing, everything should still work. That means we assert their
-            existence as late as possible.
-
-    Returns:
-        String of transforms in the format the core API expects: transform
-        specifications separated by semicolons.
-    """
-    if transforms is None:
-        return ""
-
-    try:
-        from torchvision.transforms import v2
-
-        tv_available = True
-    except ImportError:
-        tv_available = False
-
-    # The following loop accomplishes two tasks:
-    #
-    #   1. Converts the transform to a DecoderTransform, if necessary. We
-    #      accept TorchVision transform objects and they must be converted
-    #      to their matching DecoderTransform.
-    #   2. Calculates what the input dimensions are to each transform.
-    #
-    # The order in our transforms list is semantically meaningful, as we
-    # actually have a pipeline where the output of one transform is the input
-    # to the next. For example, if we have the transforms list [A, B, C, D],
-    # then we should understand that as:
-    #
-    #   A -> B -> C -> D
-    #
-    # Where the frame produced by A is the input to B, the frame produced by B
-    # is the input to C, etc. This particularly matters for frame dimensions.
-    # Transforms can both:
-    #
-    #   1. Produce frames with arbitrary dimensions.
-    #   2. Rely on their input frame's dimensions to calculate ahead-of-time
-    #      what their runtime behavior will be.
-    #
-    # The consequence of the above facts is that we need to statically track
-    # frame dimensions in the pipeline while we pre-process it. The input
-    # frame's dimensions to A, our first transform, is always what we know from
-    # our metadata. For each transform, we always calculate its output
-    # dimensions from its input dimensions. We store these with the converted
-    # transform, to be all used together when we generate the specs.
-    converted_transforms: list[
-        Tuple[
-            DecoderTransform,
-            # A (height, width) pair where the values may be missing.
-            Tuple[Optional[int], Optional[int]],
-        ]
-    ] = []
-    curr_input_dims = input_dims
-    for transform in transforms:
-        if not isinstance(transform, DecoderTransform):
-            if not tv_available:
-                raise ValueError(
-                    f"The supplied transform, {transform}, is not a TorchCodec "
-                    "DecoderTransform. TorchCodec also accepts TorchVision "
-                    "v2 transforms, but TorchVision is not installed."
-                )
-            elif isinstance(transform, v2.Resize):
-                transform = Resize._from_torchvision(transform)
-            elif isinstance(transform, v2.RandomCrop):
-                transform = RandomCrop._from_torchvision(transform)
-            else:
-                raise ValueError(
-                    f"Unsupported transform: {transform}. Transforms must be "
-                    "either a TorchCodec DecoderTransform or a TorchVision "
-                    "v2 transform."
-                )
-
-        converted_transforms.append((transform, curr_input_dims))
-        output_dims = transform._get_output_dims()
-        curr_input_dims = output_dims if output_dims is not None else curr_input_dims
-
-    return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
-
-
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
```
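The relocated `_make_transform_specs` docstring describes the key invariant: transforms form a pipeline, so the dimensions each one receives are either the previous transform's output dims or, when a transform does not fix its output size, the dims carried forward from before it. A minimal sketch of just that bookkeeping (the function and its names are hypothetical):

```python
from typing import Optional, Tuple

Dims = Tuple[Optional[int], Optional[int]]  # (height, width); values may be missing

def input_dims_per_transform(
    output_dims: list[Optional[Dims]], metadata_dims: Dims
) -> list[Dims]:
    """Hypothetical: the dims each pipeline stage receives."""
    seen, curr = [], metadata_dims
    for out in output_dims:
        seen.append(curr)
        # A transform with fixed output dims (e.g. a crop) overrides the
        # carried dims; one without (None) passes them through unchanged.
        curr = out if out is not None else curr
    return seen

# A 720x1280 stream through [Resize -> (256, 256), CenterCrop -> (224, 224)]:
print(input_dims_per_transform([(256, 256), (224, 224)], (720, 1280)))
# [(720, 1280), (256, 256)]
```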

src/torchcodec/transforms/__init__.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -4,4 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._decoder_transforms import DecoderTransform, RandomCrop, Resize  # noqa
+from ._decoder_transforms import (  # noqa
+    CenterCrop,
+    DecoderTransform,
+    RandomCrop,
+    Resize,
+)
```
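With `CenterCrop` now exported from `torchcodec.transforms`, end-to-end usage should look roughly like the sketch below. The `transforms=` parameter follows the `_make_transform_specs` plumbing shown earlier; the exact dataclass constructor arguments are assumptions, and "video.mp4" is a placeholder.

```python
from torchcodec.decoders import VideoDecoder
from torchcodec.transforms import CenterCrop, Resize

# Assumed signatures: the transforms are dataclasses taking a (height, width)
# size, passed to VideoDecoder via the `transforms` argument.
decoder = VideoDecoder(
    "video.mp4",
    transforms=[Resize(size=(256, 256)), CenterCrop(size=(224, 224))],
)
frame = decoder[0]  # decoded, resized, then center-cropped frame tensor
```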
