Skip to content

Commit f353758

Browse files
authored
Merge branch 'meta-pytorch:main' into cpu-fallback
2 parents e97490e + 4e412b7 commit f353758

File tree

11 files changed

+651
-118
lines changed

11 files changed

+651
-118
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
# TorchCodec
44

55
TorchCodec is a Python library for decoding video and audio data into PyTorch
6-
tensors, on CPU and CUDA GPU. It also supports audio encoding, and video
7-
encoding will come soon! It aims to be fast, easy to use, and well integrated
6+
tensors, on CPU and CUDA GPU. It also supports video and audio encoding on CPU!
7+
It aims to be fast, easy to use, and well integrated
88
into the PyTorch ecosystem. If you want to use PyTorch to train ML models on
99
videos and audio, TorchCodec is how you turn these into data.
1010

@@ -130,7 +130,8 @@ The following table indicates the compatibility between versions of
130130

131131
| `torchcodec` | `torch` | Python |
132132
| ------------------ | ------------------ | ------------------- |
133-
| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.13` |
133+
| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.14` |
134+
| `0.9` | `2.9` | `>=3.10`, `<=3.14` |
134135
| `0.8` | `2.9` | `>=3.10`, `<=3.13` |
135136
| `0.7` | `2.8` | `>=3.9`, `<=3.13` |
136137
| `0.6` | `2.8` | `>=3.9`, `<=3.13` |

docs/source/api_ref_transforms.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,6 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
1414
:template: dataclass.rst
1515

1616
DecoderTransform
17+
CenterCrop
18+
RandomCrop
1719
Resize

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,12 +545,14 @@ void SingleStreamDecoder::addVideoStream(
545545

546546
metadataDims_ =
547547
FrameDims(streamMetadata.height.value(), streamMetadata.width.value());
548+
FrameDims currInputDims = metadataDims_;
548549
for (auto& transform : transforms) {
549550
TORCH_CHECK(transform != nullptr, "Transforms should never be nullptr!");
550551
if (transform->getOutputFrameDims().has_value()) {
551552
resizedOutputDims_ = transform->getOutputFrameDims().value();
552553
}
553-
transform->validate(streamMetadata);
554+
transform->validate(currInputDims);
555+
currInputDims = resizedOutputDims_.value_or(metadataDims_);
554556

555557
// Note that we are claiming ownership of the transform objects passed in to
556558
// us.

src/torchcodec/_core/Transform.cpp

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,72 @@ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
3737
return outputDims_;
3838
}
3939

40+
CropTransform::CropTransform(const FrameDims& dims) : outputDims_(dims) {}
41+
4042
CropTransform::CropTransform(const FrameDims& dims, int x, int y)
4143
: outputDims_(dims), x_(x), y_(y) {
4244
TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
4345
TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
4446
}
4547

4648
std::string CropTransform::getFilterGraphCpu() const {
49+
// For the FFmpeg filter crop, if the x and y coordinates are left
50+
// unspecified, it defaults to a center crop.
51+
std::string coordinates = x_.has_value()
52+
? (":" + std::to_string(x_.value()) + ":" + std::to_string(y_.value()))
53+
: "";
4754
return "crop=" + std::to_string(outputDims_.width) + ":" +
48-
std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
49-
std::to_string(y_) + ":exact=1";
55+
std::to_string(outputDims_.height) + coordinates + ":exact=1";
5056
}
5157

5258
std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
5359
return outputDims_;
5460
}
5561

56-
void CropTransform::validate(const StreamMetadata& streamMetadata) const {
57-
TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
62+
void CropTransform::validate(const FrameDims& inputDims) const {
63+
TORCH_CHECK(
64+
outputDims_.height <= inputDims.height,
65+
"Crop output height (",
66+
outputDims_.height,
67+
") is greater than input height (",
68+
inputDims.height,
69+
")");
5870
TORCH_CHECK(
59-
x_ + outputDims_.width <= streamMetadata.width,
60-
"Crop x position out of bounds")
61-
TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
71+
outputDims_.width <= inputDims.width,
72+
"Crop output width (",
73+
outputDims_.width,
74+
") is greater than input width (",
75+
inputDims.width,
76+
")");
6277
TORCH_CHECK(
63-
y_ + outputDims_.height <= streamMetadata.height,
64-
"Crop y position out of bounds");
78+
x_.has_value() == y_.has_value(),
79+
"Crop x and y values must be both set or both unset");
80+
if (x_.has_value()) {
81+
TORCH_CHECK(
82+
x_.value() <= inputDims.width,
83+
"Crop x start position, ",
84+
x_.value(),
85+
", out of bounds of input width, ",
86+
inputDims.width);
87+
TORCH_CHECK(
88+
x_.value() + outputDims_.width <= inputDims.width,
89+
"Crop x end position, ",
90+
x_.value() + outputDims_.width,
91+
", out of bounds of input width ",
92+
inputDims.width);
93+
TORCH_CHECK(
94+
y_.value() <= inputDims.height,
95+
"Crop y start position, ",
96+
y_.value(),
97+
", out of bounds of input height, ",
98+
inputDims.height);
99+
TORCH_CHECK(
100+
y_.value() + outputDims_.height <= inputDims.height,
101+
"Crop y end position, ",
102+
y_.value() + outputDims_.height,
103+
", out of bounds of input height ",
104+
inputDims.height);
105+
}
65106
}
66107

67108
} // namespace facebook::torchcodec

src/torchcodec/_core/Transform.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ class Transform {
3636
//
3737
// Note that the validation function does not return anything. We expect
3838
// invalid configurations to throw an exception.
39-
virtual void validate(
40-
[[maybe_unused]] const StreamMetadata& streamMetadata) const {}
39+
virtual void validate([[maybe_unused]] const FrameDims& inputDims) const {}
4140
};
4241

4342
class ResizeTransform : public Transform {
@@ -62,14 +61,17 @@ class CropTransform : public Transform {
6261
public:
6362
CropTransform(const FrameDims& dims, int x, int y);
6463

64+
// Becomes a center crop if x and y are not specified.
65+
CropTransform(const FrameDims& dims);
66+
6567
std::string getFilterGraphCpu() const override;
6668
std::optional<FrameDims> getOutputFrameDims() const override;
67-
void validate(const StreamMetadata& streamMetadata) const override;
69+
void validate(const FrameDims& inputDims) const override;
6870

6971
private:
7072
FrameDims outputDims_;
71-
int x_;
72-
int y_;
73+
std::optional<int> x_;
74+
std::optional<int> y_;
7375
};
7476

7577
} // namespace facebook::torchcodec

src/torchcodec/_core/custom_ops.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,19 @@ int checkedToPositiveInt(const std::string& str) {
239239
return ret;
240240
}
241241

242+
int checkedToNonNegativeInt(const std::string& str) {
243+
int ret = 0;
244+
try {
245+
ret = std::stoi(str);
246+
} catch (const std::invalid_argument&) {
247+
TORCH_CHECK(false, "String cannot be converted to an int:" + str);
248+
} catch (const std::out_of_range&) {
249+
TORCH_CHECK(false, "String would become integer out of range:" + str);
250+
}
251+
TORCH_CHECK(ret >= 0, "String must be a non-negative integer:" + str);
252+
return ret;
253+
}
254+
242255
// Resize transform specs take the form:
243256
//
244257
// "resize, <height>, <width>"
@@ -270,11 +283,28 @@ Transform* makeCropTransform(
270283
"cropTransformSpec must have 5 elements including its name");
271284
int height = checkedToPositiveInt(cropTransformSpec[1]);
272285
int width = checkedToPositiveInt(cropTransformSpec[2]);
273-
int x = checkedToPositiveInt(cropTransformSpec[3]);
274-
int y = checkedToPositiveInt(cropTransformSpec[4]);
286+
int x = checkedToNonNegativeInt(cropTransformSpec[3]);
287+
int y = checkedToNonNegativeInt(cropTransformSpec[4]);
275288
return new CropTransform(FrameDims(height, width), x, y);
276289
}
277290

291+
// CenterCrop transform specs take the form:
292+
//
293+
// "center_crop, <height>, <width>"
294+
//
295+
// Where "center_crop" is the string literal and <height>, <width> are
296+
// positive integers. Note that we follow the PyTorch convention of (height,
297+
// width) for specifying image dimensions; FFmpeg uses (width, height).
298+
Transform* makeCenterCropTransform(
299+
const std::vector<std::string>& cropTransformSpec) {
300+
TORCH_CHECK(
301+
cropTransformSpec.size() == 3,
302+
"cropTransformSpec must have 3 elements including its name");
303+
int height = checkedToPositiveInt(cropTransformSpec[1]);
304+
int width = checkedToPositiveInt(cropTransformSpec[2]);
305+
return new CropTransform(FrameDims(height, width));
306+
}
307+
278308
std::vector<std::string> split(const std::string& str, char delimiter) {
279309
std::vector<std::string> tokens;
280310
std::string token;
@@ -304,6 +334,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
304334
transforms.push_back(makeResizeTransform(transformSpec));
305335
} else if (name == "crop") {
306336
transforms.push_back(makeCropTransform(transformSpec));
337+
} else if (name == "center_crop") {
338+
transforms.push_back(makeCenterCropTransform(transformSpec));
307339
} else {
308340
TORCH_CHECK(false, "Invalid transform name: " + name);
309341
}

src/torchcodec/decoders/_video_decoder.py

Lines changed: 7 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import numbers
1010
from dataclasses import dataclass
1111
from pathlib import Path
12-
from typing import List, Literal, Optional, Sequence, Tuple, Union
12+
from typing import Literal, Optional, Sequence, Tuple, Union
1313

1414
import torch
1515
from torch import device as torch_device, nn, Tensor
@@ -20,7 +20,8 @@
2020
create_decoder,
2121
ERROR_REPORTING_INSTRUCTIONS,
2222
)
23-
from torchcodec.transforms import DecoderTransform, Resize
23+
from torchcodec.transforms import DecoderTransform
24+
from torchcodec.transforms._decoder_transforms import _make_transform_specs
2425

2526

2627
@dataclass
@@ -217,7 +218,10 @@ def __init__(
217218
device = str(device)
218219

219220
device_variant = _get_cuda_backend()
220-
transform_specs = _make_transform_specs(transforms)
221+
transform_specs = _make_transform_specs(
222+
transforms,
223+
input_dims=(self.metadata.height, self.metadata.width),
224+
)
221225

222226
core.add_video_stream(
223227
self._decoder,
@@ -523,78 +527,6 @@ def _get_and_validate_stream_metadata(
523527
)
524528

525529

526-
def _convert_to_decoder_transforms(
527-
transforms: Sequence[Union[DecoderTransform, nn.Module]],
528-
) -> List[DecoderTransform]:
529-
"""Convert a sequence of transforms that may contain TorchVision transform
530-
objects into a list of only TorchCodec transform objects.
531-
532-
Args:
533-
transforms: Sequence of transform objects. The objects can be one of two
534-
types:
535-
1. torchcodec.transforms.DecoderTransform
536-
2. torchvision.transforms.v2.Transform, but our type annotation
537-
only mentions its base, nn.Module. We don't want to take a
538-
hard dependency on TorchVision.
539-
540-
Returns:
541-
List of DecoderTransform objects.
542-
"""
543-
try:
544-
from torchvision.transforms import v2
545-
546-
tv_available = True
547-
except ImportError:
548-
tv_available = False
549-
550-
converted_transforms: list[DecoderTransform] = []
551-
for transform in transforms:
552-
if not isinstance(transform, DecoderTransform):
553-
if not tv_available:
554-
raise ValueError(
555-
f"The supplied transform, {transform}, is not a TorchCodec "
556-
" DecoderTransform. TorchCodec also accept TorchVision "
557-
"v2 transforms, but TorchVision is not installed."
558-
)
559-
elif isinstance(transform, v2.Resize):
560-
converted_transforms.append(Resize._from_torchvision(transform))
561-
else:
562-
raise ValueError(
563-
f"Unsupported transform: {transform}. Transforms must be "
564-
"either a TorchCodec DecoderTransform or a TorchVision "
565-
"v2 transform."
566-
)
567-
else:
568-
converted_transforms.append(transform)
569-
570-
return converted_transforms
571-
572-
573-
def _make_transform_specs(
574-
transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
575-
) -> str:
576-
"""Given a sequence of transforms, turn those into the specification string
577-
the core API expects.
578-
579-
Args:
580-
transforms: Optional sequence of transform objects. The objects can be
581-
one of two types:
582-
1. torchcodec.transforms.DecoderTransform
583-
2. torchvision.transforms.v2.Transform, but our type annotation
584-
only mentions its base, nn.Module. We don't want to take a
585-
hard dependency on TorchVision.
586-
587-
Returns:
588-
String of transforms in the format the core API expects: transform
589-
specifications separate by semicolons.
590-
"""
591-
if transforms is None:
592-
return ""
593-
594-
transforms = _convert_to_decoder_transforms(transforms)
595-
return ";".join([t._make_transform_spec() for t in transforms])
596-
597-
598530
def _read_custom_frame_mappings(
599531
custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
600532
) -> tuple[Tensor, Tensor, Tensor]:

src/torchcodec/transforms/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,9 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._decoder_transforms import DecoderTransform, Resize # noqa
7+
from ._decoder_transforms import ( # noqa
8+
CenterCrop,
9+
DecoderTransform,
10+
RandomCrop,
11+
Resize,
12+
)

0 commit comments

Comments
 (0)