Add MPEG4 support

NicolasHug · NicolasHug · commit 490c13b04622 · 2025-10-04T14:46:07.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -154,8 +154,9 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
       return cudaVideoCodec_VP9;
     case AV_CODEC_ID_VP8:
       return cudaVideoCodec_VP8;
+    case AV_CODEC_ID_MPEG4:
+      return cudaVideoCodec_MPEG4;
     // TODONVDEC P0: support more codecs
-    // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
     // case AV_CODEC_ID_MJPEG: return cudaVideoCodec_JPEG;
     default: {
       TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
@@ -272,6 +273,14 @@ void BetaCudaDeviceInterface::initializeBSF(
       }
       break;
     }
+    case AV_CODEC_ID_MPEG4: {
+      const std::string formatName =
+          avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
+      if (formatName == "avi") {
+        filterName = "mpeg4_unpack_bframes";
+      }
+      break;
+    }
 
     default:
       // No bitstream filter needed for other codecs
@@ -514,19 +523,15 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
   avFrame->format = AV_PIX_FMT_CUDA;
   avFrame->pts = dispInfo.timestamp;
 
-  // TODONVDEC P2: We compute the duration based on average frame rate info:
-  // either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But
-  // both of these are based on average frame rate, so if the video has
-  // variable frame rate, the durations may be off. We should try to see if we
-  // can set the duration more accurately. Unfortunately it's not given by
-  // dispInfo. One option would be to set it based on the pts difference between
-  // consecutive frames, if the next frame is already available.
-  int frameRateNum = static_cast<int>(videoFormat_.frame_rate.numerator);
-  int frameRateDen = static_cast<int>(videoFormat_.frame_rate.denominator);
-  AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0)
-      ? AVRational{frameRateNum, frameRateDen}
-      : frameRateAvgFromFFmpeg_;
-  setDuration(avFrame, computeSafeDuration(frameRate, timeBase_));
+  // TODONVDEC P2: We compute the duration based on average frame rate info, so
+  // if the video has variable frame rate, the durations may be off. We
+  // should try to see if we can set the duration more accurately. Unfortunately
+  // it's not given by dispInfo. One option would be to set it based on the pts
+  // difference between consecutive frames, if the next frame is already
+  // available.
+  // Note that we used to rely on videoFormat_.frame_rate for this, but that
+  // proved less accurate than FFmpeg.
+  setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));
 
   // We need to assign the frame colorspace. This is crucial for proper color
   // conversion. NVCUVID stores that in the matrix_coefficients field, but
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -472,6 +472,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
   // For background, see
   // Note [YUV -> RGB Color Conversion, color space and color range]
+  // if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
     if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
       // NPP provides a pre-defined color conversion function for BT.709 full
diff --git a/test/resources/testsrc2_mpeg4.avi b/test/resources/testsrc2_mpeg4.avi
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -45,6 +45,7 @@
     SINE_MONO_S32_8000,
     TEST_SRC_2_720P,
     TEST_SRC_2_720P_H265,
+    TEST_SRC_2_720P_MPEG4,
     TEST_SRC_2_720P_VP8,
     TEST_SRC_2_720P_VP9,
     unsplit_device_str,
@@ -1434,15 +1435,20 @@ def test_get_frames_at_tensor_indices(self):
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.int))
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.float))
 
-    # TODONVDEC P1 unskip equality assertion checks on FFMpeg4. The comparison
-    # checks are failing on very few pixels, e.g.:
+    # TODONVDEC P1:
+    # - unskip equality assertion checks on FFmpeg 4. The comparison
+    #   checks are failing on very few pixels, e.g.:
     #
-    # E   Mismatched elements: 648586 / 82944000 (0.8%)
-    # E   Greatest absolute difference: 164 at index (20, 2, 27, 96)
-    # E   Greatest relative difference: inf at index (5, 1, 112, 186)
+    #   E   Mismatched elements: 648586 / 82944000 (0.8%)
+    #   E   Greatest absolute difference: 164 at index (20, 2, 27, 96)
+    #   E   Greatest relative difference: inf at index (5, 1, 112, 186)
     #
-    # So we're skipping them to unblock for now, but we should call
-    # assert_tensor_close_on_at_least or something like that.
+    #   So we're skipping them to unblock for now, but we should call
+    #   assert_tensor_close_on_at_least or something like that.
+    # - unskip equality assertion checks for MPEG4 asset. The frames are decoded
+    #   fine, it's the color conversion that's different. The frame from the
+    #   BETA interface is assumed to be BT.709 while the one from the default
+    #   interface is BT.601.
 
     @needs_cuda
     @pytest.mark.parametrize(
@@ -1455,6 +1461,7 @@ def test_get_frames_at_tensor_indices(self):
             AV1_VIDEO,
             TEST_SRC_2_720P_VP9,
             TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
@@ -1483,7 +1490,15 @@ def test_beta_cuda_interface_get_frame_at(
         for frame_index in indices:
             ref_frame = ref_decoder.get_frame_at(frame_index)
             beta_frame = beta_decoder.get_frame_at(frame_index)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            if asset == TEST_SRC_2_720P_MPEG4:
+                from torchvision.io import write_png
+                from torchvision.utils import make_grid
+
+                img = make_grid([beta_frame.data, ref_frame.data], nrow=2)
+                write_png(img.cpu(), f"/tmp/frame_{frame_index:04d}.png")
+
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )
@@ -1502,6 +1517,7 @@ def test_beta_cuda_interface_get_frame_at(
             AV1_VIDEO,
             TEST_SRC_2_720P_VP9,
             TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
@@ -1530,7 +1546,8 @@ def test_beta_cuda_interface_get_frames_at(
 
         ref_frames = ref_decoder.get_frames_at(indices)
         beta_frames = beta_decoder.get_frames_at(indices)
-        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+        # TODONVDEC P1 see above
+        if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
             torch.testing.assert_close(
                 beta_frames.data, ref_frames.data, rtol=0, atol=0
             )
@@ -1550,6 +1567,7 @@ def test_beta_cuda_interface_get_frames_at(
             AV1_VIDEO,
             TEST_SRC_2_720P_VP9,
             TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
@@ -1573,7 +1591,8 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         for pts in timestamps:
             ref_frame = ref_decoder.get_frame_played_at(pts)
             beta_frame = beta_decoder.get_frame_played_at(pts)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )
@@ -1589,9 +1608,10 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
             TEST_SRC_2_720P,
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
             TEST_SRC_2_720P_VP9,
             TEST_SRC_2_720P_VP8,
-            AV1_VIDEO,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
@@ -1615,7 +1635,8 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
 
         ref_frames = ref_decoder.get_frames_played_at(timestamps)
         beta_frames = beta_decoder.get_frames_played_at(timestamps)
-        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+        # TODONVDEC P1 see above
+        if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
             torch.testing.assert_close(
                 beta_frames.data, ref_frames.data, rtol=0, atol=0
             )
@@ -1635,6 +1656,7 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
             AV1_VIDEO,
             TEST_SRC_2_720P_VP9,
             TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
@@ -1662,7 +1684,8 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
             ref_frame = ref_decoder.get_frame_at(frame_index)
             beta_frame = beta_decoder.get_frame_at(frame_index)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )
diff --git a/test/utils.py b/test/utils.py
@@ -737,3 +737,13 @@ def sample_format(self) -> str:
     },
     frames={0: {}},  # Not needed for now
 )
+
+# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 testsrc2_mpeg4.avi
+TEST_SRC_2_720P_MPEG4 = TestVideo(
+    filename="testsrc2_mpeg4.avi",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)