2 changes: 1 addition & 1 deletion .github/workflows/linux_wheel.yaml
@@ -85,7 +85,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
2 changes: 1 addition & 1 deletion .github/workflows/macos_wheel.yaml
@@ -86,7 +86,7 @@ jobs:

- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
46 changes: 44 additions & 2 deletions .github/workflows/reference_resources.yaml
@@ -14,14 +14,51 @@ defaults:
shell: bash -l -eo pipefail {0}

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
os: linux
test-infra-repository: pytorch/test-infra
test-infra-ref: main
with-xpu: disable
with-rocm: disable
with-cuda: disable
build-python-only: "disable"

build:
needs: generate-matrix
strategy:
fail-fast: false
name: Build and Upload Linux wheel
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
with:
repository: meta-pytorch/torchcodec
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
pre-script: packaging/pre_build_script.sh
post-script: packaging/post_build_script.sh
smoke-test-script: packaging/fake_smoke_test.py
package-name: torchcodec
trigger-event: ${{ github.event_name }}
build-platform: "python-build-package"
build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 python -m build --wheel -vvv --no-isolation"

test-reference-resource-generation:
needs: build
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ['3.10']
ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1']
steps:
- uses: actions/download-artifact@v4
with:
name: meta-pytorch_torchcodec__${{ matrix.python-version }}_cpu_x86_64
path: pytorch/torchcodec/dist/
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
@@ -43,11 +80,16 @@ jobs:
# Note that we're installing stable - this is for running a script where we're a normal PyTorch
# user, not for building TorchCodec.
python -m pip install torch --index-url https://download.pytorch.org/whl/cpu
- python -m pip install numpy pillow
+ python -m pip install numpy pillow pytest

- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
echo Installing $wheel_path
python -m pip install $wheel_path -vvv
- name: Check out repo
uses: actions/checkout@v3

- name: Run generation reference resources
run: |
- python test/generate_reference_resources.py
+ python -m test.generate_reference_resources
2 changes: 1 addition & 1 deletion .github/workflows/windows_wheel.yaml
@@ -90,7 +90,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
3 changes: 2 additions & 1 deletion src/torchcodec/_core/FilterGraph.cpp
@@ -130,7 +130,8 @@ FilterGraph::FilterGraph(
TORCH_CHECK(
status >= 0,
"Failed to configure filter graph: ",
- getFFMPEGErrorStringFromErrorCode(status));
+ getFFMPEGErrorStringFromErrorCode(status),
+ ", provided filters: " + filtersContext.filtergraphStr);
}

UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
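With this change, a filtergraph configuration failure also names the offending expression. Assuming the status code maps to AVERROR(EINVAL), the message would read something like "Failed to configure filter graph: Invalid argument, provided filters: crop=300:200:50:35:exact=1" (the filter string here is illustrative).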
5 changes: 5 additions & 0 deletions src/torchcodec/_core/Frame.cpp
@@ -8,6 +8,11 @@

namespace facebook::torchcodec {

FrameDims::FrameDims(int height, int width) : height(height), width(width) {
TORCH_CHECK(height > 0, "FrameDims.height must be > 0, got: ", height);
TORCH_CHECK(width > 0, "FrameDims.width must be > 0, got: ", width);
}
Contributor Author:

Drive-by: when passing in a height and width, we should only be able to instantiate a FrameDims object with positive values. If we want a FrameDims object that has 0 for both values, that's just the default constructor. We should never have a FrameDims object with negative values.
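A minimal sketch of the invariant (illustrative values, not code from this PR):

```cpp
FrameDims ok(480, 640);   // height=480, width=640: passes both checks
FrameDims empty;          // default constructor: the only way to get 0x0
// FrameDims bad(0, 640); // would throw: TORCH_CHECK(height > 0, ...) fails
```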


FrameBatchOutput::FrameBatchOutput(
int64_t numFrames,
const FrameDims& outputDims,
2 changes: 1 addition & 1 deletion src/torchcodec/_core/Frame.h
@@ -19,7 +19,7 @@ struct FrameDims {

FrameDims() = default;

- FrameDims(int h, int w) : height(h), width(w) {}
+ FrameDims(int h, int w);
};

// All public video decoding entry points return either a FrameOutput or a
2 changes: 2 additions & 0 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -12,6 +12,7 @@
#include <sstream>
#include <stdexcept>
#include <string_view>
#include "Metadata.h"
#include "torch/types.h"

namespace facebook::torchcodec {
@@ -527,6 +528,7 @@ void SingleStreamDecoder::addVideoStream(
if (transform->getOutputFrameDims().has_value()) {
resizedOutputDims_ = transform->getOutputFrameDims().value();
}
transform->validate(streamMetadata);

// Note that we are claiming ownership of the transform objects passed in to
// us.
27 changes: 27 additions & 0 deletions src/torchcodec/_core/Transform.cpp
@@ -57,4 +57,31 @@ int ResizeTransform::getSwsFlags() const {
return toSwsInterpolation(interpolationMode_);
}

CropTransform::CropTransform(const FrameDims& dims, int x, int y)
: outputDims_(dims), x_(x), y_(y) {
TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
}

std::string CropTransform::getFilterGraphCpu() const {
return "crop=" + std::to_string(outputDims_.width) + ":" +
std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
std::to_string(y_) + ":exact=1";
}

std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
return outputDims_;
}

void CropTransform::validate(const StreamMetadata& streamMetadata) const {
TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
TORCH_CHECK(
x_ + outputDims_.width <= streamMetadata.width,
"Crop x position out of bounds")
TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
TORCH_CHECK(
y_ + outputDims_.height <= streamMetadata.height,
"Crop y position out of bounds");
}

} // namespace facebook::torchcodec
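To make the height/width ordering concrete, here is a sketch that exercises only the API added above (values illustrative): PyTorch-style (height, width) in, FFmpeg-style width:height:x:y out.

```cpp
// exact=1 keeps subsampled video cropped at exactly the requested offsets
// instead of rounding them to the chroma grid.
CropTransform crop(FrameDims(/*height=*/200, /*width=*/300), /*x=*/50, /*y=*/35);
std::string expr = crop.getFilterGraphCpu();
// expr == "crop=300:200:50:35:exact=1"
```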
25 changes: 25 additions & 0 deletions src/torchcodec/_core/Transform.h
@@ -9,6 +9,7 @@
#include <optional>
#include <string>
#include "src/torchcodec/_core/Frame.h"
#include "src/torchcodec/_core/Metadata.h"

namespace facebook::torchcodec {

@@ -33,6 +34,16 @@ class Transform {
virtual bool isResize() const {
return false;
}

// The validity of some transforms depends on the characteristics of the
// AVStream they're being applied to. For example, some transforms will
// specify coordinates inside a frame, we need to validate that those are
// within the frame's bounds.
//
// Note that the validation function does not return anything. We expect
// invalid configurations to throw an exception.
virtual void validate(
[[maybe_unused]] const StreamMetadata& streamMetadata) const {}
};

class ResizeTransform : public Transform {
@@ -56,4 +67,18 @@ class ResizeTransform : public Transform {
InterpolationMode interpolationMode_;
};

class CropTransform : public Transform {
public:
CropTransform(const FrameDims& dims, int x, int y);

std::string getFilterGraphCpu() const override;
std::optional<FrameDims> getOutputFrameDims() const override;
void validate(const StreamMetadata& streamMetadata) const override;

private:
FrameDims outputDims_;
int x_;
int y_;
};

} // namespace facebook::torchcodec
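A sketch of the validate() contract in action, assuming StreamMetadata exposes optional width/height fields (stream dimensions here are illustrative):

```cpp
StreamMetadata streamMetadata;
streamMetadata.width = 640;
streamMetadata.height = 480;

CropTransform inBounds(FrameDims(200, 300), /*x=*/50, /*y=*/35);
inBounds.validate(streamMetadata);  // returns: 50+300 <= 640 and 35+200 <= 480

CropTransform outOfBounds(FrameDims(200, 300), /*x=*/400, /*y=*/35);
// outOfBounds.validate(streamMetadata);  // would throw: 400+300 > 640
```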
22 changes: 22 additions & 0 deletions src/torchcodec/_core/custom_ops.cpp
@@ -212,6 +212,26 @@ Transform* makeResizeTransform(
return new ResizeTransform(FrameDims(height, width));
}

// Crop transform specs take the form:
//
// "crop, <height>, <width>, <x>, <y>"
//
// Where "crop" is the string literal and <height>, <width>, <x> and <y> are
// positive integers. <x> and <y> are the x and y coordinates of the top left
// corner of the crop. Note that we follow the PyTorch convention of (height,
// width) for specifying image dimensions; FFmpeg uses (width, height).
Transform* makeCropTransform(
const std::vector<std::string>& cropTransformSpec) {
TORCH_CHECK(
cropTransformSpec.size() == 5,
"cropTransformSpec must have 5 elements including its name");
int height = checkedToPositiveInt(cropTransformSpec[1]);
int width = checkedToPositiveInt(cropTransformSpec[2]);
int x = checkedToPositiveInt(cropTransformSpec[3]);
int y = checkedToPositiveInt(cropTransformSpec[4]);
return new CropTransform(FrameDims(height, width), x, y);
}

std::vector<std::string> split(const std::string& str, char delimiter) {
std::vector<std::string> tokens;
std::string token;
@@ -239,6 +259,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
auto name = transformSpec[0];
if (name == "resize") {
transforms.push_back(makeResizeTransform(transformSpec));
} else if (name == "crop") {
transforms.push_back(makeCropTransform(transformSpec));
} else {
TORCH_CHECK(false, "Invalid transform name: " + name);
}
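End to end, a crop spec maps to a transform as sketched below (spec value illustrative, using the functions added above):

```cpp
// "crop, 200, 300, 50, 35" -> CropTransform(FrameDims(200, 300), /*x=*/50, /*y=*/35),
// whose CPU filtergraph is "crop=300:200:50:35:exact=1".
std::vector<Transform*> transforms = makeTransforms("crop, 200, 300, 50, 35");
```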
54 changes: 42 additions & 12 deletions test/generate_reference_resources.py
@@ -12,9 +12,15 @@
import torch
from PIL import Image

from .utils import sanitize_filtergraph_expression
Contributor Author (@scotts, Oct 17, 2025):

Note that pulling in code from utils.py means we have to run this as a module, and since utils.py imports general TorchCodec stuff, we do a full build and install for the reference_resources.yaml workflow. I think that's a fair trade-off to ensure that the unit tests and the resource generation are using the same logic for writing and loading files.

Contributor:

Do we expect to pull in more things from utils.py? If it's just for sanitize_filtergraph_expression, I would have a slight preference for just duplicating it in both files, because the python -m stuff isn't immediately obvious. But that's fine too.


# Run this script to update the resources used in unit tests. The resources are all derived
# from source media already checked into the repo.

SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"


def convert_image_to_tensor(image_path):
image_path = Path(image_path)
@@ -31,7 +37,18 @@ def convert_image_to_tensor(image_path):
image_path.unlink()


- def get_frame_by_index(video_path, frame, output_path, stream):
+ def get_frame_by_index(video_path, frame, output_path, stream, filters=None):
# Note that we have an explicit format conversion to rgb24 in our filtergraph
# specification, placed BEFORE any of the filters that we receive as input. This
# matches the behavior of the torchcodec filtergraph implementation.
#
# Without it, the color conversion would happen AFTER the filters, producing
# different color values for the same frame.
filtergraph = f"select='eq(n\\,{frame})',format=rgb24"
if filters is not None:
filtergraph = filtergraph + f",{filters}"

cmd = [
"ffmpeg",
"-y",
@@ -40,11 +57,11 @@ def get_frame_by_index(video_path, frame, output_path, stream):
"-map",
f"0:{stream}",
"-vf",
f"select=eq(n\\,{frame})",
"-vsync",
"vfr",
"-q:v",
"2",
filtergraph,
"-fps_mode",
"passthrough",
"-update",
"1",
output_path,
]
subprocess.run(cmd, check=True)
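For example, with frame=15 and the crop filter used below in generate_nasa_13013_references, the constructed filtergraph is select='eq(n\,15)',format=rgb24,crop=300:200:50:35:exact=1: FFmpeg selects the frame, converts it to rgb24, and only then crops.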
@@ -65,14 +82,9 @@ def get_frame_by_timestamp(video_path, timestamp, output_path):
subprocess.run(cmd, check=True)


- def main():
- SCRIPT_DIR = Path(__file__).resolve().parent
- TORCHCODEC_PATH = SCRIPT_DIR.parent
- RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
+ def generate_nasa_13013_references():
VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4"

- # Last generated with ffmpeg version 4.3
- #
# Note: The naming scheme used here must match the naming scheme used to load
# tensors in ./utils.py.
STREAMS = [0, 3]
@@ -95,6 +107,16 @@ def main():
get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp)
convert_image_to_tensor(output_bmp)

# Extract frames with specific filters. We have tests that assume these exact filters.
FRAMES = [0, 15, 200, 389]
crop_filter = "crop=300:200:50:35:exact=1"
for frame in FRAMES:
output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp"
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter)
convert_image_to_tensor(output_bmp)


def generate_h265_video_references():
# This video was generated by running the following:
# conda install -c conda-forge x265
# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
@@ -107,6 +129,8 @@ def main():
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
convert_image_to_tensor(output_bmp)


def generate_av1_video_references():
# This video was generated by running the following:
# ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv
# Note that this video only has 1 stream, at index 0.
@@ -119,5 +143,11 @@
convert_image_to_tensor(output_bmp)


def main():
generate_nasa_13013_references()
generate_h265_video_references()
generate_av1_video_references()


if __name__ == "__main__":
main()
4 binary files not shown.