compare against cli, high % match

Daniel Flores · Daniel Flores · commit 206d4e4dd010 · 2025-10-06T11:27:23.000-04:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -4,6 +4,10 @@
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
+extern "C" {
+#include <libavutil/pixdesc.h>
+}
+
 namespace facebook::torchcodec {
 
 namespace {
@@ -579,6 +583,7 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
+  av_log_set_level(AV_LOG_VERBOSE);
   const AVCodec* avCodec =
       avcodec_find_encoder(avFormatContext_->oformat->video_codec);
   TORCH_CHECK(avCodec != nullptr, "Video codec not found");
@@ -625,12 +630,22 @@ void VideoEncoder::initializeEncoder(
 
   // Apply videoStreamOptions
   AVDictionary* options = nullptr;
-  if (videoStreamOptions.crf.has_value()) {
+  if (videoStreamOptions.crf.has_value() &&
+      (avCodec->id != AV_CODEC_ID_MPEG4 && avCodec->id != AV_CODEC_ID_FLV1)) {
     av_dict_set(
         &options,
         "crf",
         std::to_string(videoStreamOptions.crf.value()).c_str(),
         0);
+  } else {
+    // For codecs that don't support CRF (mpeg4, flv1),
+    // use quality-based encoding via global_quality + qscale flag
+    avCodecContext_->flags |= AV_CODEC_FLAG_QSCALE;
+    // While qscale is similar to crf, it is likely not interchangeable.
+    // Reuse of crf below is only intended to work in VideoEncoder tests where
+    // crf = 0
+    avCodecContext_->global_quality =
+        FF_QP2LAMBDA * videoStreamOptions.crf.value();
   }
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
   av_dict_free(&options);
@@ -694,7 +709,7 @@ UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
         outWidth_,
         outHeight_,
         outPixelFormat_,
-        SWS_BILINEAR,
+        SWS_BICUBIC, // Used by FFmpeg CLI
         nullptr,
         nullptr,
         nullptr));
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -9,7 +9,13 @@
 import os
 from functools import partial
 
-from .utils import get_ffmpeg_major_version, in_fbcode, IS_WINDOWS
+from .utils import (
+    assert_tensor_close_on_at_least,
+    get_ffmpeg_major_version,
+    in_fbcode,
+    IS_WINDOWS,
+    TEST_SRC_2_720P,
+)
 
 os.environ["TORCH_LOGS"] = "output_code"
 import json
@@ -1341,7 +1347,7 @@ def decode(self, file_path) -> torch.Tensor:
         return frames
 
     @pytest.mark.parametrize("format", ("mov", "mp4", "avi", "mkv", "webm", "flv"))
-    def test_video_encoder_test_round_trip(self, tmp_path, format):
+    def test_video_encoder_round_trip(self, tmp_path, format):
         ffmpeg_version = get_ffmpeg_major_version()
         if format == "webm":
             if ffmpeg_version == 4:
@@ -1352,7 +1358,7 @@ def test_video_encoder_test_round_trip(self, tmp_path, format):
                 pytest.skip(
                     "Codec for webm is not available in the FFmpeg6/7 installation on Windows."
                 )
-        asset = NASA_VIDEO
+        asset = TEST_SRC_2_720P
         # Test that decode(encode(decode(asset))) == decode(asset)
         source_frames = self.decode(str(asset.path)).data
 
@@ -1362,20 +1368,93 @@ def test_video_encoder_test_round_trip(self, tmp_path, format):
             frames=source_frames, frame_rate=frame_rate, filename=encoded_path, crf=0
         )
         round_trip_frames = self.decode(encoded_path).data
-
-        # In the cases where a lossy pixel format conversion occurs, higher tolerance is needed.
-        # Converting to the output format may perform chroma subsampling.
-        # Other times, no conversion between YUV and RGB is required.
+        assert (
+            source_frames.shape == round_trip_frames.shape
+        ), f"Shape mismatch: source {source_frames.shape} vs round_trip {round_trip_frames.shape}"
+        assert (
+            source_frames.dtype == round_trip_frames.dtype
+        ), f"Dtype mismatch: source {source_frames.dtype} vs round_trip {round_trip_frames.dtype}"
+
+        # If FFmpeg selects a codec or pixel format that does lossy encoding, assert 99% of pixels
+        # are within a higher tolerance.
         if ffmpeg_version == 6 or format in ("avi", "flv"):
-            atol = 55
+            assert_close = partial(assert_tensor_close_on_at_least, percentage=99)
+            atol = 15
         else:
+            assert_close = torch.testing.assert_close
             atol = 2
-        # TODO-VideoEncoder: Test with FFmpeg's testsrc2 video
         # Check that PSNR for decode(encode(samples)) is above 30
         for s_frame, rt_frame in zip(source_frames, round_trip_frames):
             res = psnr(s_frame, rt_frame)
             assert res > 30
-            torch.testing.assert_close(s_frame, rt_frame, atol=atol, rtol=0)
+            assert_close(s_frame, rt_frame, atol=atol, rtol=0)
+
+    @pytest.mark.skipif(in_fbcode(), reason="ffmpeg CLI not available")
+    @pytest.mark.parametrize("format", ("mov", "mp4", "avi", "mkv", "webm", "flv"))
+    def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format):
+        ffmpeg_version = get_ffmpeg_major_version()
+        if format == "webm" and ffmpeg_version == 4:
+            pytest.skip("Codec for webm is not available in the FFmpeg4 installation.")
+        asset = TEST_SRC_2_720P
+        source_frames = self.decode(str(asset.path)).data
+        frame_rate = 30
+
+        # Encode with FFmpeg CLI
+        temp_raw_path = str(tmp_path / "temp_input.raw")
+        with open(temp_raw_path, "wb") as f:
+            f.write(source_frames.permute(0, 2, 3, 1).cpu().numpy().tobytes())
+
+        ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_output.{format}")
+        # Test that lossless encoding is identical
+        crf = 0
+        quality_params = ["-crf", str(crf)]
+        # Some codecs (ex. MPEG4) do not support CRF, qscale is used for lossless encoding.
+        # Flags not supported by the selected codec will be ignored, so we set both crf and qscale.
+        quality_params += ["-q:v", str(crf)]
+        ffmpeg_cmd = [
+            "ffmpeg",
+            "-y",
+            "-f",
+            "rawvideo",
+            "-pix_fmt",
+            "rgb24",
+            "-s",
+            f"{source_frames.shape[3]}x{source_frames.shape[2]}",
+            "-r",
+            str(frame_rate),
+            "-i",
+            temp_raw_path,
+            *quality_params,
+            ffmpeg_encoded_path,
+        ]
+        subprocess.run(ffmpeg_cmd, check=True)
+
+        # Encode with our video encoder
+        encoder_output_path = str(tmp_path / f"encoder_output.{format}")
+        encode_video_to_file(
+            frames=source_frames,
+            frame_rate=frame_rate,
+            filename=encoder_output_path,
+            crf=crf,
+        )
+
+        ffmpeg_frames = self.decode(ffmpeg_encoded_path).data
+        encoder_frames = self.decode(encoder_output_path).data
+
+        assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
+
+        # If FFmpeg selects a codec or pixel format that uses qscale (not crf),
+        # the VideoEncoder outputs *slightly* different frames.
+        # There may be additional subtle differences in the encoder.
+        percentage = 97 if ffmpeg_version == 6 or format in ("avi") else 99
+
+        # Check that PSNR between both encoded versions is high
+        for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
+            res = psnr(ff_frame, enc_frame)
+            assert res > 30
+            assert_tensor_close_on_at_least(
+                ff_frame, enc_frame, percentage=percentage, atol=2
+            )
 
 
 if __name__ == "__main__":