More comments, add pytest to reference resources

scotts · scotts · commit a15d45862f92 · 2025-10-16T18:41:11.000-07:00
diff --git a/.github/workflows/reference_resources.yaml b/.github/workflows/reference_resources.yaml
@@ -43,7 +43,7 @@ jobs:
           # Note that we're installing stable - this is for running a script where we're a normal PyTorch
           # user, not for building TorchCodec.
           python -m pip install torch --index-url https://download.pytorch.org/whl/cpu
-          python -m pip install numpy pillow
+          python -m pip install numpy pillow pytest
 
       - name: Check out repo
         uses: actions/checkout@v3
diff --git a/src/torchcodec/_core/Transform.cpp b/src/torchcodec/_core/Transform.cpp
@@ -75,7 +75,9 @@ std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
 
 void CropTransform::validate(const StreamMetadata& streamMetadata) const {
   TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
+  TORCH_CHECK(x_ + outputDims_.width <= streamMetadata.width, "Crop x position out of bounds");
   TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
+  TORCH_CHECK(y_ + outputDims_.height <= streamMetadata.height, "Crop y position out of bounds");
 }
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -214,12 +214,12 @@ Transform* makeResizeTransform(
 
 // Crop transform specs take the form:
 //
-//   "crop, <height>, <width> <x>, <y>"
+//   "crop, <height>, <width>, <x>, <y>"
 //
 // Where "crop" is the string literal and <height>, <width>, <x> and <y> are
-// positive integers. Note that that in this spec, we are following the
-// filtergraph convention of (width, height). This makes it easier to compare it
-// against actual filtergraph strings.
+// positive integers. <x> and <y> are the x and y coordinates of the top left
+// corner of the crop. Note that we follow the PyTorch convention of (height,
+// width) for specifying image dimensions; FFmpeg uses (width, height).
 Transform* makeCropTransform(
     const std::vector<std::string>& cropTransformSpec) {
   TORCH_CHECK(
diff --git a/test/test_transform_ops.py b/test/test_transform_ops.py
@@ -7,7 +7,6 @@
 import contextlib
 import os
 
-os.environ["TORCH_LOGS"] = "output_code"
 import json
 import subprocess
 
@@ -210,9 +209,13 @@ def test_resize_transform_fails(self):
 
     def test_crop_transform(self):
         # Note that filtergraph accepts dimensions as (w, h) and we accept them as (h, w).
-        crop_spec = "crop, 200, 300, 50, 35"  # h=200, w=300, x=50, y=35
-        crop_filtergraph = "crop=300:200:50:35:exact=1"  # w=300, h=200, x=50, y=35
-        expected_shape = (3, 200, 300)  # channels=3, height=200, width=300
+        width = 300
+        height = 200
+        x = 50
+        y = 35
+        crop_spec = f"crop, {height}, {width}, {x}, {y}"
+        crop_filtergraph = f"crop={width}:{height}:{x}:{y}:exact=1"
+        expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
 
         decoder_crop = create_from_file(str(NASA_VIDEO.path))
         add_video_stream(decoder_crop, transform_specs=crop_spec)
@@ -228,7 +231,7 @@ def test_crop_transform(self):
 
             frame_full, *_ = get_frame_at_index(decoder_full, frame_index=frame_index)
             frame_tv = v2.functional.crop(
-                frame_full, top=35, left=50, height=200, width=300
+                frame_full, top=y, left=x, height=height, width=width
             )
 
             assert frame.shape == expected_shape
@@ -239,28 +242,38 @@ def test_crop_transform(self):
             assert_frames_equal(frame, frame_ref)
 
     def test_crop_transform_fails(self):
-        decoder = create_from_file(str(NASA_VIDEO.path))
 
         with pytest.raises(
             RuntimeError,
             match="must have 5 elements",
         ):
+            decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, 100, 100")
 
         with pytest.raises(
             RuntimeError,
             match="must be a positive integer",
         ):
+            decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, -10, 100, 100, 100")
 
         with pytest.raises(
             RuntimeError,
             match="cannot be converted to an int",
         ):
+            decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, 100, 100, blah, 100")
 
         with pytest.raises(
             RuntimeError,
             match="x position out of bounds",
         ):
+            decoder = create_from_file(str(NASA_VIDEO.path))
             add_video_stream(decoder, transform_specs="crop, 100, 100, 9999, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="y position out of bounds",
+        ):
+            decoder = create_from_file(str(NASA_VIDEO.path))
+            add_video_stream(decoder, transform_specs="crop, 999, 100, 100, 100")

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,9 @@ std::optional<FrameDims> CropTransform::getOutputFrameDims() const {`
`75`	`75`
`76`	`76`	`void CropTransform::validate(const StreamMetadata& streamMetadata) const {`
`77`	`77`	`TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");`
	`78`	`+ TORCH_CHECK(x_ + outputDims_.width <= streamMetadata.width, "Crop x position out of bounds");`
`78`	`79`	`TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");`
	`80`	`+ TORCH_CHECK(y_ + outputDims_.height <= streamMetadata.height, "Crop y position out of bounds");`
`79`	`81`	`}`
`80`	`82`
`81`	`83`	`} // namespace facebook::torchcodec`