diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 9d8ba7b3c..078655462 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -150,11 +150,24 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) { return cudaVideoCodec_HEVC; case AV_CODEC_ID_AV1: return cudaVideoCodec_AV1; - // TODONVDEC P0: support more codecs - // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4; - // case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8; - // case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9; - // case AV_CODEC_ID_MJPEG: return cudaVideoCodec_JPEG; + case AV_CODEC_ID_VP9: + return cudaVideoCodec_VP9; + case AV_CODEC_ID_VP8: + return cudaVideoCodec_VP8; + case AV_CODEC_ID_MPEG4: + return cudaVideoCodec_MPEG4; + // Formats below are currently not tested, but they should "mostly" work. + // MPEG1 was briefly locally tested and it was ok-ish despite duration being + // off. Since they're far less popular, we keep them disabled by default but + // we can consider enabling them upon user requests. + // case AV_CODEC_ID_MPEG1VIDEO: + // return cudaVideoCodec_MPEG1; + // case AV_CODEC_ID_MPEG2VIDEO: + // return cudaVideoCodec_MPEG2; + // case AV_CODEC_ID_MJPEG: + // return cudaVideoCodec_JPEG; + // case AV_CODEC_ID_VC1: + // return cudaVideoCodec_VC1; default: { TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId)); } @@ -270,10 +283,17 @@ void BetaCudaDeviceInterface::initializeBSF( } break; } + case AV_CODEC_ID_MPEG4: { + const std::string formatName = + avFormatCtx->iformat->name ? avFormatCtx->iformat->name : ""; + if (formatName == "avi") { + filterName = "mpeg4_unpack_bframes"; + } + break; + } default: // No bitstream filter needed for other codecs - // TODONVDEC P1 MPEG4 will need one! 
break; } @@ -512,19 +532,15 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( avFrame->format = AV_PIX_FMT_CUDA; avFrame->pts = dispInfo.timestamp; - // TODONVDEC P2: We compute the duration based on average frame rate info: - // either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But - // both of these are based on average frame rate, so if the video has - // variable frame rate, the durations may be off. We should try to see if we - // can set the duration more accurately. Unfortunately it's not given by - // dispInfo. One option would be to set it based on the pts difference between - // consecutive frames, if the next frame is already available. - int frameRateNum = static_cast(videoFormat_.frame_rate.numerator); - int frameRateDen = static_cast(videoFormat_.frame_rate.denominator); - AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0) - ? AVRational{frameRateNum, frameRateDen} - : frameRateAvgFromFFmpeg_; - setDuration(avFrame, computeSafeDuration(frameRate, timeBase_)); + // TODONVDEC P2: We compute the duration based on average frame rate info, so + // if the video has variable frame rate, the durations may be off. We + // should try to see if we can set the duration more accurately. Unfortunately + // it's not given by dispInfo. One option would be to set it based on the pts + // difference between consecutive frames, if the next frame is already + // available. + // Note that we used to rely on videoFormat_.frame_rate for this, but that + // proved less accurate than FFmpeg. + setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_)); // We need to assign the frame colorspace. This is crucial for proper color // conversion. 
NVCUVID stores that in the matrix_coefficients field, but diff --git a/test/resources/testsrc2_mpeg4.avi b/test/resources/testsrc2_mpeg4.avi new file mode 100644 index 000000000..ea202c531 Binary files /dev/null and b/test/resources/testsrc2_mpeg4.avi differ diff --git a/test/resources/testsrc2_vp8.webm b/test/resources/testsrc2_vp8.webm new file mode 100644 index 000000000..3c01d69a2 Binary files /dev/null and b/test/resources/testsrc2_vp8.webm differ diff --git a/test/resources/testsrc2_vp9.webm b/test/resources/testsrc2_vp9.webm new file mode 100644 index 000000000..728ffaad0 Binary files /dev/null and b/test/resources/testsrc2_vp9.webm differ diff --git a/test/test_decoders.py b/test/test_decoders.py index c803bc592..e5139d089 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -43,8 +43,12 @@ SINE_MONO_S32, SINE_MONO_S32_44100, SINE_MONO_S32_8000, + supports_approximate_mode, TEST_SRC_2_720P, TEST_SRC_2_720P_H265, + TEST_SRC_2_720P_MPEG4, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_VP9, unsplit_device_str, ) @@ -588,7 +592,7 @@ def test_get_frame_at_av1(self, device): return if device == "cuda" and in_fbcode(): - pytest.skip("AV1 decoding on CUDA is not supported internally") + pytest.skip("decoding on CUDA is not supported internally") decoder = VideoDecoder(AV1_VIDEO.path, device=device) device, _ = unsplit_device_str(device) @@ -1432,15 +1436,20 @@ def test_get_frames_at_tensor_indices(self): decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.int)) decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.float)) - # TODONVDEC P1 unskip equality assertion checks on FFMpeg4. The comparison - # checks are failing on very few pixels, e.g.: + # TODONVDEC P1: + # - unskip equality assertion checks on FFMpeg4. 
The comparison + # checks are failing on very few pixels, e.g.: # - # E Mismatched elements: 648586 / 82944000 (0.8%) - # E Greatest absolute difference: 164 at index (20, 2, 27, 96) - # E Greatest relative difference: inf at index (5, 1, 112, 186) + # E Mismatched elements: 648586 / 82944000 (0.8%) + # E Greatest absolute difference: 164 at index (20, 2, 27, 96) + # E Greatest relative difference: inf at index (5, 1, 112, 186) # - # So we're skipping them to unblock for now, but we should call - # assert_tensor_close_on_at_least or something like that. + # So we're skipping them to unblock for now, but we should call + # assert_tensor_close_on_at_least or something like that. + # - unskip equality assertion checks for MPEG4 asset. The frames are decoded + # fine, it's the color conversion that's different. The frame from the + # BETA interface is assumed to be 709 while the one from the default + # interface is 601. @needs_cuda @pytest.mark.parametrize( @@ -1451,6 +1460,9 @@ def test_get_frames_at_tensor_indices(self): BT709_FULL_RANGE, TEST_SRC_2_720P_H265, AV1_VIDEO, + TEST_SRC_2_720P_VP9, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_MPEG4, ), ) @pytest.mark.parametrize("contiguous_indices", (True, False)) @@ -1458,8 +1470,8 @@ def test_get_frames_at_tensor_indices(self): def test_beta_cuda_interface_get_frame_at( self, asset, contiguous_indices, seek_mode ): - if asset == AV1_VIDEO and seek_mode == "approximate": - pytest.skip("AV1 asset doesn't work with approximate mode") + if seek_mode == "approximate" and not supports_approximate_mode(asset): + pytest.skip("asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1476,7 +1488,8 @@ def test_beta_cuda_interface_get_frame_at( for frame_index in indices: ref_frame = ref_decoder.get_frame_at(frame_index) beta_frame = beta_decoder.get_frame_at(frame_index) - if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + # 
TODONVDEC P1 see above + if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4: torch.testing.assert_close( beta_frame.data, ref_frame.data, rtol=0, atol=0 ) @@ -1493,6 +1506,9 @@ def test_beta_cuda_interface_get_frame_at( BT709_FULL_RANGE, TEST_SRC_2_720P_H265, AV1_VIDEO, + TEST_SRC_2_720P_VP9, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_MPEG4, ), ) @pytest.mark.parametrize("contiguous_indices", (True, False)) @@ -1500,8 +1516,8 @@ def test_beta_cuda_interface_get_frame_at( def test_beta_cuda_interface_get_frames_at( self, asset, contiguous_indices, seek_mode ): - if asset == AV1_VIDEO and seek_mode == "approximate": - pytest.skip("AV1 asset doesn't work with approximate mode") + if seek_mode == "approximate" and not supports_approximate_mode(asset): + pytest.skip("asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1518,7 +1534,8 @@ def test_beta_cuda_interface_get_frames_at( ref_frames = ref_decoder.get_frames_at(indices) beta_frames = beta_decoder.get_frames_at(indices) - if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + # TODONVDEC P1 see above + if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4: torch.testing.assert_close( beta_frames.data, ref_frames.data, rtol=0, atol=0 ) @@ -1536,12 +1553,15 @@ def test_beta_cuda_interface_get_frames_at( BT709_FULL_RANGE, TEST_SRC_2_720P_H265, AV1_VIDEO, + TEST_SRC_2_720P_VP9, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_MPEG4, ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): - if asset == AV1_VIDEO and seek_mode == "approximate": - pytest.skip("AV1 asset doesn't work with approximate mode") + if seek_mode == "approximate" and not supports_approximate_mode(asset): + pytest.skip("asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", 
seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1556,7 +1576,8 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): for pts in timestamps: ref_frame = ref_decoder.get_frame_played_at(pts) beta_frame = beta_decoder.get_frame_played_at(pts) - if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + # TODONVDEC P1 see above + if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4: torch.testing.assert_close( beta_frame.data, ref_frame.data, rtol=0, atol=0 ) @@ -1573,12 +1594,15 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): BT709_FULL_RANGE, TEST_SRC_2_720P_H265, AV1_VIDEO, + TEST_SRC_2_720P_VP9, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_MPEG4, ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): - if asset == AV1_VIDEO and seek_mode == "approximate": - pytest.skip("AV1 asset doesn't work with approximate mode") + if seek_mode == "approximate" and not supports_approximate_mode(asset): + pytest.skip("asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1593,7 +1617,8 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): ref_frames = ref_decoder.get_frames_played_at(timestamps) beta_frames = beta_decoder.get_frames_played_at(timestamps) - if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + # TODONVDEC P1 see above + if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4: torch.testing.assert_close( beta_frames.data, ref_frames.data, rtol=0, atol=0 ) @@ -1611,12 +1636,15 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): BT709_FULL_RANGE, TEST_SRC_2_720P_H265, AV1_VIDEO, + TEST_SRC_2_720P_VP9, + TEST_SRC_2_720P_VP8, + TEST_SRC_2_720P_MPEG4, ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def 
test_beta_cuda_interface_backwards(self, asset, seek_mode): - if asset == AV1_VIDEO and seek_mode == "approximate": - pytest.skip("AV1 asset doesn't work with approximate mode") + if seek_mode == "approximate" and not supports_approximate_mode(asset): + pytest.skip("asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1635,7 +1663,8 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): ref_frame = ref_decoder.get_frame_at(frame_index) beta_frame = beta_decoder.get_frame_at(frame_index) - if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + # TODONVDEC P1 see above + if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4: torch.testing.assert_close( beta_frame.data, ref_frame.data, rtol=0, atol=0 ) diff --git a/test/utils.py b/test/utils.py index f26c013a7..7c91f307c 100644 --- a/test/utils.py +++ b/test/utils.py @@ -717,3 +717,40 @@ def sample_format(self) -> str: }, frames={0: {}}, # Not needed for now ) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm +TEST_SRC_2_720P_VP9 = TestVideo( + filename="testsrc2_vp9.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm +TEST_SRC_2_720P_VP8 = TestVideo( + filename="testsrc2_vp8.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi +TEST_SRC_2_720P_MPEG4 = TestVideo( + filename="testsrc2_mpeg4.avi", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, 
num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + + +def supports_approximate_mode(asset: TestVideo) -> bool: + # TODONVDEC P2: open an issue about this. That's actually not related to + # NVDEC at all, those don't support approximate mode because they don't set + # a duration. CPU decoder fails too! + return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)