-
Notifications
You must be signed in to change notification settings - Fork 74
C++ implementation of crop transform #967
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
b171309
3248f2f
74fe47f
c77fbd7
0511233
bba2696
4f121cb
07e7f60
67478dd
88bc94a
a15d458
8f4507b
fca8b83
8dfbee9
c1836b8
abb80eb
f819ac1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,4 +57,25 @@ int ResizeTransform::getSwsFlags() const { | |
| return toSwsInterpolation(interpolationMode_); | ||
| } | ||
|
|
||
| CropTransform::CropTransform(const FrameDims& dims, int x, int y) | ||
| : outputDims_(dims), x_(x), y_(y) { | ||
| TORCH_CHECK(x_ >= 0, "Crop x position must be positive, got: ", x_); | ||
| TORCH_CHECK(y_ >= 0, "Crop y position must be positive, got: ", y_); | ||
| } | ||
|
|
||
| std::string CropTransform::getFilterGraphCpu() const { | ||
| return "crop=" + std::to_string(outputDims_.width) + ":" + | ||
| std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" + | ||
| std::to_string(y_) + ":exact=1"; | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| std::optional<FrameDims> CropTransform::getOutputFrameDims() const { | ||
| return outputDims_; | ||
| } | ||
|
|
||
| void CropTransform::validate(const StreamMetadata& streamMetadata) const { | ||
| TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds"); | ||
| TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds"); | ||
|
Comment on lines
77
to
81
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should/can we account for H and W of the crop as well? E.g. TORCH_CHECK(x_ + W < streamMetadata.width, "Crop x position out of bounds"); |
||
| } | ||
|
|
||
| } // namespace facebook::torchcodec | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,6 +212,26 @@ Transform* makeResizeTransform( | |
| return new ResizeTransform(FrameDims(height, width)); | ||
| } | ||
|
|
||
| // Crop transform specs take the form: | ||
| // | ||
| // "crop, <width>, <height>, <x>, <y>" | ||
| // | ||
| // Where "crop" is the string literal and <width>, <height>, <x> and <y> are | ||
| // positive integers. Note that that in this spec, we are following the | ||
| // filtergraph convention of (width, height). This makes it easier to compare it | ||
| // against actual filtergraph strings. | ||
|
||
| Transform* makeCropTransform( | ||
| const std::vector<std::string>& cropTransformSpec) { | ||
| TORCH_CHECK( | ||
| cropTransformSpec.size() == 5, | ||
| "cropTransformSpec must have 5 elements including its name"); | ||
| int width = checkedToPositiveInt(cropTransformSpec[1]); | ||
| int height = checkedToPositiveInt(cropTransformSpec[2]); | ||
| int x = checkedToPositiveInt(cropTransformSpec[3]); | ||
| int y = checkedToPositiveInt(cropTransformSpec[4]); | ||
| return new CropTransform(FrameDims(height, width), x, y); | ||
| } | ||
|
|
||
| std::vector<std::string> split(const std::string& str, char delimiter) { | ||
| std::vector<std::string> tokens; | ||
| std::string token; | ||
|
|
@@ -239,6 +259,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) { | |
| auto name = transformSpec[0]; | ||
| if (name == "resize") { | ||
| transforms.push_back(makeResizeTransform(transformSpec)); | ||
| } else if (name == "crop") { | ||
| transforms.push_back(makeCropTransform(transformSpec)); | ||
| } else { | ||
| TORCH_CHECK(false, "Invalid transform name: " + name); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,10 @@ | |
| # Run this script to update the resources used in unit tests. The resources are all derived | ||
| # from source media already checked into the repo. | ||
|
|
||
# Anchor all paths on this script's own location so the resource generator
# works regardless of the current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
|
|
||
|
|
||
| def convert_image_to_tensor(image_path): | ||
| image_path = Path(image_path) | ||
|
|
@@ -31,7 +35,18 @@ def convert_image_to_tensor(image_path): | |
| image_path.unlink() | ||
|
|
||
|
|
||
| def get_frame_by_index(video_path, frame, output_path, stream): | ||
| def get_frame_by_index(video_path, frame, output_path, stream, filters=None): | ||
| # Note that we have an explicit format conversion to rgb24 in our filtergraph specification, | ||
| # which always happens BEFORE any of the filters that we receive as input. We do this to | ||
| # ensure that the color conversion happens BEFORE the filters, matching the behavior of the | ||
| # torchcodec filtergraph implementation. | ||
| # | ||
| # Not doing this would result in the color conversion happening AFTER the filters, which | ||
| # would result in different color values for the same frame. | ||
| filtergraph = f"select='eq(n\\,{frame})',format=rgb24" | ||
| if filters is not None: | ||
| filtergraph = filtergraph + f",{filters}" | ||
|
|
||
| cmd = [ | ||
| "ffmpeg", | ||
| "-y", | ||
|
|
@@ -40,13 +55,14 @@ def get_frame_by_index(video_path, frame, output_path, stream): | |
| "-map", | ||
| f"0:{stream}", | ||
| "-vf", | ||
| f"select=eq(n\\,{frame})", | ||
| "-vsync", | ||
| "vfr", | ||
| "-q:v", | ||
| "2", | ||
| filtergraph, | ||
| "-fps_mode", | ||
| "passthrough", | ||
| "-update", | ||
| "1", | ||
| output_path, | ||
| ] | ||
| print("===" + " ".join([str(x) for x in cmd])) | ||
|
||
| subprocess.run(cmd, check=True) | ||
|
|
||
|
|
||
|
|
@@ -65,14 +81,9 @@ def get_frame_by_timestamp(video_path, timestamp, output_path): | |
| subprocess.run(cmd, check=True) | ||
|
|
||
|
|
||
| def main(): | ||
| SCRIPT_DIR = Path(__file__).resolve().parent | ||
| TORCHCODEC_PATH = SCRIPT_DIR.parent | ||
| RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources" | ||
| def generate_nasa_13013_references(): | ||
| VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4" | ||
|
|
||
| # Last generated with ffmpeg version 4.3 | ||
| # | ||
| # Note: The naming scheme used here must match the naming scheme used to load | ||
| # tensors in ./utils.py. | ||
| STREAMS = [0, 3] | ||
|
|
@@ -95,6 +106,18 @@ def main(): | |
| get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
| # Extract frames with specific filters. We have tests that assume these exact filters. | ||
| # We prepend format=rgb24 to ensure the color conversion happens before the crop, | ||
| # matching the behavior of the torchcodec filtergraph implementation. | ||
| FRAMES = [0, 15, 200, 389] | ||
| crop_filter = "crop=300:200:50:35:exact=1" | ||
| for frame in FRAMES: | ||
| output_bmp = f"{VIDEO_PATH}.{crop_filter}.stream3.frame{frame:06d}.bmp" | ||
| get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
| def generate_h265_video_references(): | ||
| # This video was generated by running the following: | ||
| # conda install -c conda-forge x265 | ||
| # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz | ||
|
|
@@ -107,6 +130,8 @@ def main(): | |
| get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
| def generate_av1_video_references(): | ||
| # This video was generated by running the following: | ||
| # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv | ||
| # Note that this video only has 1 stream, at index 0. | ||
|
|
@@ -119,5 +144,11 @@ def main(): | |
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
def main():
    # Regenerate every family of reference resources. Each generator derives
    # its outputs from source media already checked into the repo.
    generate_nasa_13013_references()
    generate_h265_video_references()
    generate_av1_video_references()


if __name__ == "__main__":
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drive-by: when passing in a height and width, we should only be able to instantiate a
`FrameDims` object with positive values. If we want a `FrameDims` object that has 0 for both values, that's just the default constructor. We should never have a `FrameDims` object with negative values.