
Commit 4245bdd

More involved testing
1 parent a95dd4c commit 4245bdd

3 files changed: +72 −28 lines changed

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 19 additions & 6 deletions
@@ -69,10 +69,18 @@ void CpuDeviceInterface::initializeVideo(
     first = false;
   }
   if (!transforms.empty()) {
-    // Note that we ensure that the transforms come AFTER the format conversion.
-    // This means that the transforms are applied in the output pixel format and
-    // colorspace.
-    filters_ += "," + filters.str();
+    // Note [Transform and Format Conversion Order]
+    // We have to ensure that all user filters happen AFTER the explicit format
+    // conversion. That is, we want the filters to be applied in RGB24, not the
+    // pixel format of the input frame.
+    //
+    // The output frame will always be in RGB24, as we specify the sink node with
+    // AV_PIX_FMT_RGB24. Filtergraph will automatically insert a format
+    // conversion to ensure the output frame matches the pixel format
+    // specified in the sink. But by default, it will insert it after the user
+    // filters. We need an explicit format conversion to get the behavior we
+    // want.
+    filters_ = "format=rgb24," + filters.str();
   }
 
   initialized_ = true;
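
The ordering problem this note describes can be illustrated with plain strings. Below is a minimal sketch, not the library's code: the `crop` filter is a hypothetical stand-in for any user transform, and the string concatenation mirrors what the C++ member filters_ is assumed to hold.

    # Hypothetical user transform, standing in for any filter chain.
    user_filters = "crop=w=100:h=100"

    # Old behavior: filtergraph's automatic conversion lands AFTER the user
    # filters, so the crop runs in the input pixel format (e.g. yuv420p):
    #     crop=w=100:h=100 -> [auto-inserted format conversion to rgb24]
    #
    # New behavior: prepend an explicit conversion so the crop runs on RGB24.
    filters = "format=rgb24," + user_filters
    assert filters == "format=rgb24,crop=w=100:h=100"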
@@ -233,9 +241,14 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
     swsContext_ = createSwsContext(
         swsFrameContext,
         avFrame->colorspace,
+
+        // See Note [Transform and Format Conversion Order] for more on the
+        // output pixel format.
         /*outputFormat=*/AV_PIX_FMT_RGB24,
-        /*swsFlags=*/0); // We don't set any flags because we don't yet use
-                         // sws_scale() for resizing.
+
+        // We don't set any flags because we don't yet use sws_scale() for
+        // resizing.
+        /*swsFlags=*/0);
     prevSwsFrameContext_ = swsFrameContext;
   }
 

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 6 additions & 12 deletions
@@ -93,19 +93,13 @@ class CpuDeviceInterface : public DeviceInterface {
   // initialization, we convert the user-supplied transforms into this string of
   // filters.
   //
-  // Note that we start with just the format conversion, and then we ensure that
-  // the user-supplied filters always happen AFTER the format conversion. We
-  // want the user-supplied filters to operate on frames in the output pixel
-  // format and colorspace.
+  // Note that if there are no user-supplied transforms, then the default filter
+  // we use is the copy filter, which is just an identity: it emits the output
+  // frame unchanged. We supply such a filter because we can't supply just the
+  // empty string; we must supply SOME filter.
   //
-  // We apply the transforms on the output pixel format and colorspace because
-  // then decoder-native transforms are as close as possible to returning
-  // untransformed frames and applying TorchVision transforms to them.
-  //
-  // We ensure that the transforms happen on the output pixel format and
-  // colorspace by making sure all of the user-supplied filters happen AFTER
-  // an explicit format conversion.
-  std::string filters_ = "format=rgb24";
+  // See also Note [Transform and Format Conversion Order] for more on filters.
+  std::string filters_ = "copy";
 
   // Values set during initialization and referred to in
   // getColorConversionLibrary().
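
Taken together with the format-conversion note, the selection logic amounts to the following. This is a hypothetical Python mirror of the C++ initialization, with build_filters invented purely for illustration:

    def build_filters(user_filters: str) -> str:
        # An empty filtergraph description is invalid, so fall back to the
        # identity filter "copy" when there are no user transforms.
        if not user_filters:
            return "copy"
        # See Note [Transform and Format Conversion Order].
        return "format=rgb24," + user_filters

    assert build_filters("") == "copy"
    assert build_filters("scale=320:240") == "format=rgb24,scale=320:240"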

test/test_transform_ops.py

Lines changed: 47 additions & 10 deletions
@@ -31,19 +31,25 @@
     H265_VIDEO,
     NASA_VIDEO,
     needs_cuda,
+    TEST_SRC_2_720P,
 )
 
 torch._dynamo.config.capture_dynamic_output_shape_ops = True
 
 
 class TestCoreVideoDecoderTransformOps:
-    @pytest.mark.parametrize("video", [NASA_VIDEO, H265_VIDEO, AV1_VIDEO])
-    def test_color_conversion_library(self, video):
+    def get_num_frames_core_ops(self, video):
         decoder = create_from_file(str(video.path))
         add_video_stream(decoder)
         metadata = get_json_metadata(decoder)
         metadata_dict = json.loads(metadata)
         num_frames = metadata_dict["numFramesFromHeader"]
+        assert num_frames is not None
+        return num_frames
+
+    @pytest.mark.parametrize("video", [NASA_VIDEO, H265_VIDEO, AV1_VIDEO])
+    def test_color_conversion_library(self, video):
+        num_frames = self.get_num_frames_core_ops(video)
 
         filtergraph_decoder = create_from_file(str(video.path))
         _add_video_stream(
@@ -170,32 +176,63 @@ def test_transform_fails(self):
         "height_scaling_factor, width_scaling_factor",
         ((1.5, 1.31), (0.5, 0.71), (0.7, 1.31), (1.5, 0.71), (1.0, 1.0), (2.0, 2.0)),
     )
-    def test_resize_torchvision(self, height_scaling_factor, width_scaling_factor):
-        height = int(NASA_VIDEO.get_height() * height_scaling_factor)
-        width = int(NASA_VIDEO.get_width() * width_scaling_factor)
+    @pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P])
+    def test_resize_torchvision(
+        self, video, height_scaling_factor, width_scaling_factor
+    ):
+        num_frames = self.get_num_frames_core_ops(video)
+
+        height = int(video.get_height() * height_scaling_factor)
+        width = int(video.get_width() * width_scaling_factor)
         resize_spec = f"resize, {height}, {width}"
 
-        decoder_resize = create_from_file(str(NASA_VIDEO.path))
+        decoder_resize = create_from_file(str(video.path))
         add_video_stream(decoder_resize, transform_specs=resize_spec)
 
-        decoder_full = create_from_file(str(NASA_VIDEO.path))
+        decoder_full = create_from_file(str(video.path))
         add_video_stream(decoder_full)
 
-        for frame_index in [0, 10, 17, 100, 230, 389]:
-            expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
+        for frame_index in [
+            0,
+            int(num_frames * 0.1),
+            int(num_frames * 0.2),
+            int(num_frames * 0.3),
+            int(num_frames * 0.4),
+            int(num_frames * 0.5),
+            int(num_frames * 0.75),
+            int(num_frames * 0.90),
+            num_frames - 1,
+        ]:
+            expected_shape = (video.get_num_color_channels(), height, width)
             frame_resize, *_ = get_frame_at_index(
                 decoder_resize, frame_index=frame_index
             )
 
             frame_full, *_ = get_frame_at_index(decoder_full, frame_index=frame_index)
             frame_tv = v2.functional.resize(frame_full, size=(height, width))
+            frame_tv_no_antialias = v2.functional.resize(
+                frame_full, size=(height, width), antialias=False
+            )
 
             assert frame_resize.shape == expected_shape
             assert frame_tv.shape == expected_shape
+            assert frame_tv_no_antialias.shape == expected_shape
 
             assert_tensor_close_on_at_least(
-                frame_resize, frame_tv, percentage=99, atol=1
+                frame_resize, frame_tv, percentage=99.9, atol=1
             )
+            torch.testing.assert_close(frame_resize, frame_tv, rtol=0, atol=6)
+
+            if height_scaling_factor < 1 or width_scaling_factor < 1:
+                # Antialiasing is only relevant when down-scaling!
+                with pytest.raises(AssertionError, match="Expected at least"):
+                    assert_tensor_close_on_at_least(
+                        frame_resize, frame_tv_no_antialias, percentage=99, atol=1
+                    )
+                with pytest.raises(AssertionError, match="Tensor-likes are not close"):
+                    torch.testing.assert_close(
+                        frame_resize, frame_tv_no_antialias, rtol=0, atol=6
+                    )
 
     def test_resize_ffmpeg(self):
         height = 135
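
The match="Expected at least" assertions above depend on the error message produced by assert_tensor_close_on_at_least, a helper from the test utilities. A minimal sketch of what that helper is assumed to check: at least `percentage` percent of the elements must be within `atol` of the reference.

    import torch

    def assert_tensor_close_on_at_least(actual, expected, *, percentage, atol):
        # Fraction of elements whose absolute difference is within atol.
        close = (actual.float() - expected.float()).abs() <= atol
        pct_close = close.float().mean().item() * 100
        assert pct_close >= percentage, (
            f"Expected at least {percentage}% of elements to be within "
            f"atol={atol}, but only {pct_close:.2f}% were"
        )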
