diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 078655462..333b92bde 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -108,10 +108,30 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       " vs supported:",
       caps.nMaxMBCount);
 
+  // Below we'll set decoderParams.OutputFormat to NV12, so we need to make
+  // sure it's actually supported.
+  TORCH_CHECK(
+      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
+      "NV12 output format is not supported for this configuration. ",
+      "Codec: ",
+      static_cast<int>(videoFormat->codec),
+      ", chroma format: ",
+      static_cast<int>(videoFormat->chroma_format),
+      ", bit depth: ",
+      videoFormat->bit_depth_luma_minus8 + 8);
+
   // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
   decoderParams.ChromaFormat = videoFormat->chroma_format;
+  // We explicitly request the NV12 output format, which means 10-bit videos
+  // will be converted to 8-bit by NVDEC itself: the raw frames we get back
+  // from cuvidMapVideoFrame will already be 8-bit, so we don't need to do
+  // the conversion ourselves.
+  // In the default interface, we have to do the 10 -> 8-bit conversion
+  // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg
+  // explicitly requests 10- or 16-bit output formats for >8-bit videos:
+  // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
   decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
   decoderParams.ulCreationFlags = cudaVideoCreate_Default;
   decoderParams.CodecType = videoFormat->codec;
diff --git a/test/test_decoders.py b/test/test_decoders.py
index e5139d089..ca73b99b8 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1288,18 +1288,15 @@ def test_10bit_videos(self, device, asset):
         # This just validates that we can decode 10-bit videos.
         # TODO validate against the ref that the decoded frames are correct
 
-        if device == "cuda:0:beta":
-            # This fails on our BETA interface on asset 0 (only!) with:
+        if device == "cuda:0:beta" and asset is H264_10BITS:
+            # This fails on the BETA interface with:
             #
             # RuntimeError: Codec configuration not supported on this GPU.
             # Codec: 4, chroma format: 1, bit depth: 10
             #
-            # I don't remember but I suspect asset 0 is actually the one that
-            # fallsback to the CPU path on the default CUDA interface (that
-            # would make sense)
-            # We should investigate if and how we could make that fallback
-            # happen for the BETA interface.
-            pytest.skip("TODONVDEC P2 - investigate and unskip")
+            # It works on the default interface because FFmpeg falls back to
+            # the CPU, while the BETA interface doesn't.
+            pytest.skip("Asset not supported by NVDEC")
 
         decoder = VideoDecoder(asset.path, device=device)
         decoder.get_frame_at(10)
@@ -1674,12 +1671,11 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
     @needs_cuda
     def test_beta_cuda_interface_small_h265(self):
-        # TODONVDEC P2 investigate why/how the default interface can decode this
-        # video.
+        # Test to illustrate the current difference in behavior between the BETA
+        # and the default interface: this video isn't supported by NVDEC, but in
+        # the default interface, FFmpeg falls back to the CPU while we don't.
 
-        # This is fine on the default interface - why?
         VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-        # But it fails on the beta interface due to input validation checks, which we took from DALI!
         with pytest.raises(
             RuntimeError,
             match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144",
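
For context on the capability check: `nOutputFormatMask` returned by `cuvidGetDecoderCaps()` is a bitmask indexed by the `cudaVideoSurfaceFormat` enum values, which is why the new TORCH_CHECK shifts by `cudaVideoSurfaceFormat_NV12` and tests the low bit. Below is a minimal sketch of how that check generalizes to any output format; it is not part of the patch, assumes the standard Video Codec SDK header (`nvcuvid.h`) and a current CUDA context, and the `isOutputFormatSupported` helper name is hypothetical:

```cpp
#include <nvcuvid.h>

// Queries decoder capabilities for a codec / chroma format / bit depth
// combination and checks whether `format` appears in the supported-output
// bitmask. nOutputFormatMask has bit i set when the surface format with enum
// value i is supported (cudaVideoSurfaceFormat_NV12 == 0, P016 == 1, ...).
// Assumes a CUDA context is current on the calling thread.
static bool isOutputFormatSupported(
    cudaVideoCodec codec,
    cudaVideoChromaFormat chromaFormat,
    unsigned int bitDepthMinus8,
    cudaVideoSurfaceFormat format) {
  CUVIDDECODECAPS caps = {};
  caps.eCodecType = codec;
  caps.eChromaFormat = chromaFormat;
  caps.nBitDepthMinus8 = bitDepthMinus8;
  if (cuvidGetDecoderCaps(&caps) != CUDA_SUCCESS || !caps.bIsSupported) {
    return false;
  }
  return (caps.nOutputFormatMask >> format) & 1;
}
```

The same pattern could, for example, guard a P016 output format if the decoder ever wanted to keep true 10-bit frames instead of letting NVDEC downconvert to NV12.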