meta-pytorch · NicolasHug · Oct 4, 2025 · Oct 4, 2025 · Oct 4, 2025 · Oct 4, 2025
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -150,11 +150,24 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
       return cudaVideoCodec_HEVC;
     case AV_CODEC_ID_AV1:
       return cudaVideoCodec_AV1;
-    // TODONVDEC P0: support more codecs
-    // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
-    // case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8;
-    // case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9;
-    // case AV_CODEC_ID_MJPEG: return cudaVideoCodec_JPEG;
+    case AV_CODEC_ID_VP9:
+      return cudaVideoCodec_VP9;
+    case AV_CODEC_ID_VP8:
+      return cudaVideoCodec_VP8;
+    case AV_CODEC_ID_MPEG4:
+      return cudaVideoCodec_MPEG4;
+    // Formats below are currently not tested, but they should "mostly" work.
+    // MPEG1 was briefly locally tested and it was ok-ish despite duration being
+    // off. Since they're far less popular, we keep them disabled by default but
+    // we can consider enabling them upon user requests.
+    // case AV_CODEC_ID_MPEG1VIDEO:
+    //   return cudaVideoCodec_MPEG1;
+    // case AV_CODEC_ID_MPEG2VIDEO:
+    //   return cudaVideoCodec_MPEG2;
+    // case AV_CODEC_ID_MJPEG:
+    //   return cudaVideoCodec_JPEG;
+    // case AV_CODEC_ID_VC1:
+    //   return cudaVideoCodec_VC1;
     default: {
       TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
     }
@@ -270,10 +283,17 @@ void BetaCudaDeviceInterface::initializeBSF(
       }
       break;
     }
+    case AV_CODEC_ID_MPEG4: {
+      const std::string formatName =
+          avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
+      if (formatName == "avi") {
+        filterName = "mpeg4_unpack_bframes";
+      }
+      break;
+    }
 
     default:
       // No bitstream filter needed for other codecs
-      // TODONVDEC P1 MPEG4 will need one!
       break;
   }
 
@@ -512,19 +532,15 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
   avFrame->format = AV_PIX_FMT_CUDA;
   avFrame->pts = dispInfo.timestamp;
 
-  // TODONVDEC P2: We compute the duration based on average frame rate info:
-  // either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But
-  // both of these are based on average frame rate, so if the video has
-  // variable frame rate, the durations may be off. We should try to see if we
-  // can set the duration more accurately. Unfortunately it's not given by
-  // dispInfo. One option would be to set it based on the pts difference between
-  // consecutive frames, if the next frame is already available.
-  int frameRateNum = static_cast<int>(videoFormat_.frame_rate.numerator);
-  int frameRateDen = static_cast<int>(videoFormat_.frame_rate.denominator);
-  AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0)
-      ? AVRational{frameRateNum, frameRateDen}
-      : frameRateAvgFromFFmpeg_;
-  setDuration(avFrame, computeSafeDuration(frameRate, timeBase_));
+  // TODONVDEC P2: We compute the duration based on average frame rate info, so
+  // if the video has variable frame rate, the durations may be off. We
+  // should try to see if we can set the duration more accurately. Unfortunately
+  // it's not given by dispInfo. One option would be to set it based on the pts
+  // difference between consecutive frames, if the next frame is already
+  // available.
+  // Note that we used to rely on videoFormat_.frame_rate for this, but that
+  // proved less accurate than FFmpeg.
+  setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));
 
   // We need to assign the frame colorspace. This is crucial for proper color
   // conversion. NVCUVID stores that in the matrix_coefficients field, but

diff --git a/test/resources/testsrc2_mpeg4.avi b/test/resources/testsrc2_mpeg4.avi
diff --git a/test/resources/testsrc2_vp8.webm b/test/resources/testsrc2_vp8.webm
diff --git a/test/resources/testsrc2_vp9.webm b/test/resources/testsrc2_vp9.webm
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -43,8 +43,12 @@
     SINE_MONO_S32,
     SINE_MONO_S32_44100,
     SINE_MONO_S32_8000,
+    supports_approximate_mode,
     TEST_SRC_2_720P,
     TEST_SRC_2_720P_H265,
+    TEST_SRC_2_720P_MPEG4,
+    TEST_SRC_2_720P_VP8,
+    TEST_SRC_2_720P_VP9,
     unsplit_device_str,
 )
 
@@ -588,7 +592,7 @@ def test_get_frame_at_av1(self, device):
             return
 
         if device == "cuda" and in_fbcode():
-            pytest.skip("AV1 decoding on CUDA is not supported internally")
+            pytest.skip("decoding on CUDA is not supported internally")
 
         decoder = VideoDecoder(AV1_VIDEO.path, device=device)
         device, _ = unsplit_device_str(device)
@@ -1432,15 +1436,20 @@ def test_get_frames_at_tensor_indices(self):
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.int))
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.float))
 
-    # TODONVDEC P1 unskip equality assertion checks on FFMpeg4. The comparison
-    # checks are failing on very few pixels, e.g.:
+    # TODONVDEC P1:
+    # - unskip equality assertion checks on FFMpeg4. The comparison
+    #   checks are failing on very few pixels, e.g.:
     #
-    # E   Mismatched elements: 648586 / 82944000 (0.8%)
-    # E   Greatest absolute difference: 164 at index (20, 2, 27, 96)
-    # E   Greatest relative difference: inf at index (5, 1, 112, 186)
+    #   E   Mismatched elements: 648586 / 82944000 (0.8%)
+    #   E   Greatest absolute difference: 164 at index (20, 2, 27, 96)
+    #   E   Greatest relative difference: inf at index (5, 1, 112, 186)
     #
-    # So we're skipping them to unblock for now, but we should call
-    # assert_tensor_close_on_at_least or something like that.
+    #   So we're skipping them to unblock for now, but we should call
+    #   assert_tensor_close_on_at_least or something like that.
+    # - unskip equality assertion checks for MPEG4 asset. The frames are decoded
+    #   fine, it's the color conversion that's different. The frame from the
+    #   BETA interface is assumed to be BT.709 while the one from the default
+    #   interface is BT.601.
 
     @needs_cuda
     @pytest.mark.parametrize(
@@ -1451,15 +1460,18 @@ def test_get_frames_at_tensor_indices(self):
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
             AV1_VIDEO,
+            TEST_SRC_2_720P_VP9,
+            TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_at(
         self, asset, contiguous_indices, seek_mode
     ):
-        if asset == AV1_VIDEO and seek_mode == "approximate":
-            pytest.skip("AV1 asset doesn't work with approximate mode")
+        if seek_mode == "approximate" and not supports_approximate_mode(asset):
+            pytest.skip("asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1476,7 +1488,8 @@ def test_beta_cuda_interface_get_frame_at(
         for frame_index in indices:
             ref_frame = ref_decoder.get_frame_at(frame_index)
             beta_frame = beta_decoder.get_frame_at(frame_index)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )
@@ -1493,15 +1506,18 @@ def test_beta_cuda_interface_get_frame_at(
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
             AV1_VIDEO,
+            TEST_SRC_2_720P_VP9,
+            TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_at(
         self, asset, contiguous_indices, seek_mode
     ):
-        if asset == AV1_VIDEO and seek_mode == "approximate":
-            pytest.skip("AV1 asset doesn't work with approximate mode")
+        if seek_mode == "approximate" and not supports_approximate_mode(asset):
+            pytest.skip("asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1518,7 +1534,8 @@ def test_beta_cuda_interface_get_frames_at(
 
         ref_frames = ref_decoder.get_frames_at(indices)
         beta_frames = beta_decoder.get_frames_at(indices)
-        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+        # TODONVDEC P1 see above
+        if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
             torch.testing.assert_close(
                 beta_frames.data, ref_frames.data, rtol=0, atol=0
             )
@@ -1536,12 +1553,15 @@ def test_beta_cuda_interface_get_frames_at(
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
             AV1_VIDEO,
+            TEST_SRC_2_720P_VP9,
+            TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
-        if asset == AV1_VIDEO and seek_mode == "approximate":
-            pytest.skip("AV1 asset doesn't work with approximate mode")
+        if seek_mode == "approximate" and not supports_approximate_mode(asset):
+            pytest.skip("asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1556,7 +1576,8 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         for pts in timestamps:
             ref_frame = ref_decoder.get_frame_played_at(pts)
             beta_frame = beta_decoder.get_frame_played_at(pts)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )
@@ -1573,12 +1594,15 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
             AV1_VIDEO,
+            TEST_SRC_2_720P_VP9,
+            TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
-        if asset == AV1_VIDEO and seek_mode == "approximate":
-            pytest.skip("AV1 asset doesn't work with approximate mode")
+        if seek_mode == "approximate" and not supports_approximate_mode(asset):
+            pytest.skip("asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1593,7 +1617,8 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
 
         ref_frames = ref_decoder.get_frames_played_at(timestamps)
         beta_frames = beta_decoder.get_frames_played_at(timestamps)
-        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+        # TODONVDEC P1 see above
+        if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
             torch.testing.assert_close(
                 beta_frames.data, ref_frames.data, rtol=0, atol=0
             )
@@ -1611,12 +1636,15 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
             BT709_FULL_RANGE,
             TEST_SRC_2_720P_H265,
             AV1_VIDEO,
+            TEST_SRC_2_720P_VP9,
+            TEST_SRC_2_720P_VP8,
+            TEST_SRC_2_720P_MPEG4,
         ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_backwards(self, asset, seek_mode):
-        if asset == AV1_VIDEO and seek_mode == "approximate":
-            pytest.skip("AV1 asset doesn't work with approximate mode")
+        if seek_mode == "approximate" and not supports_approximate_mode(asset):
+            pytest.skip("asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1635,7 +1663,8 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
             ref_frame = ref_decoder.get_frame_at(frame_index)
             beta_frame = beta_decoder.get_frame_at(frame_index)
-            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            # TODONVDEC P1 see above
+            if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
                 torch.testing.assert_close(
                     beta_frame.data, ref_frame.data, rtol=0, atol=0
                 )

diff --git a/test/utils.py b/test/utils.py
@@ -717,3 +717,40 @@ def sample_format(self) -> str:
     },
     frames={0: {}},  # Not needed for now
 )
+
+# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm
+TEST_SRC_2_720P_VP9 = TestVideo(
+    filename="testsrc2_vp9.webm",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)
+
+# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm
+TEST_SRC_2_720P_VP8 = TestVideo(
+    filename="testsrc2_vp8.webm",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)
+
+# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi
+TEST_SRC_2_720P_MPEG4 = TestVideo(
+    filename="testsrc2_mpeg4.avi",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)
+
+
+def supports_approximate_mode(asset: TestVideo) -> bool:
+    # TODONVDEC P2: open an issue about this. It's actually not related to
+    # NVDEC at all: those assets don't support approximate mode because they
+    # don't set a duration. CPU decoder fails too!
+    return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)