diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index d55bb1137..7124e4309 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { } static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { - // Check decoder capabilities - same checks as DALI - auto caps = CUVIDDECODECAPS{}; - caps.eCodecType = videoFormat->codec; - caps.eChromaFormat = videoFormat->chroma_format; - caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; - CUresult result = cuvidGetDecoderCaps(&caps); - TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - - TORCH_CHECK( - caps.bIsSupported, - "Codec configuration not supported on this GPU. " - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); - - TORCH_CHECK( - videoFormat->coded_width >= caps.nMinWidth && - videoFormat->coded_height >= caps.nMinHeight, - "Video is too small in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMinWidth, - "x", - caps.nMinHeight); - - TORCH_CHECK( - videoFormat->coded_width <= caps.nMaxWidth && - videoFormat->coded_height <= caps.nMaxHeight, - "Video is too large in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMaxWidth, - "x", - caps.nMaxHeight); - - // See nMaxMBCount in cuviddec.h - constexpr unsigned int macroblockConstant = 256; - TORCH_CHECK( - videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant <= - caps.nMaxMBCount, - "Video is too large (too many macroblocks). " - "Provided (width * height / ", - macroblockConstant, - "): ", - videoFormat->coded_width * videoFormat->coded_height / macroblockConstant, - " vs supported:", - caps.nMaxMBCount); - - // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make - // sure it's actually supported. - TORCH_CHECK( - (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1, - "NV12 output format is not supported for this configuration. ", - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); - // Decoder creation parameters, most are taken from DALI CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; @@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { decoderParams.display_area.bottom = videoFormat->display_area.bottom; CUvideodecoder* decoder = new CUvideodecoder(); - result = cuvidCreateDecoder(decoder, &decoderParams); + CUresult result = cuvidCreateDecoder(decoder, &decoderParams); TORCH_CHECK( result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result); return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } -cudaVideoCodec validateCodecSupport(AVCodecID codecId) { +std::optional validateChromaSupport( + const AVPixFmtDescriptor* desc) { + // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt + // otherwise. + TORCH_CHECK(desc != nullptr, "desc can't be null"); + + if (desc->nb_components == 1) { + return cudaVideoChromaFormat_Monochrome; + } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + // Make sure it's YUV: has chroma planes and isn't RGB + if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) { + return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4 + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) { + return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0 + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) { + return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2 + } + } + + return std::nullopt; +} + +std::optional validateCodecSupport(AVCodecID codecId) { + // Return the corresponding cudaVideoCodec if supported, std::nullopt + // otherwise + // Note that we currently return nullopt (and thus fallback to CPU) for some + // codecs that are technically supported by NVDEC, see comment below. switch (codecId) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; @@ -189,10 +147,69 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) { // return cudaVideoCodec_JPEG; // case AV_CODEC_ID_VC1: // return cudaVideoCodec_VC1; - default: { - TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId)); - } + default: + return std::nullopt; + } +} + +bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { + // Return true iff the input video stream is supported by our NVDEC + // implementation. + auto codecType = validateCodecSupport(codecContext->codec_id); + if (!codecType.has_value()) { + return false; + } + + const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt); + if (!desc) { + return false; + } + + auto chromaFormat = validateChromaSupport(desc); + if (!chromaFormat.has_value()) { + return false; + } + + auto caps = CUVIDDECODECAPS{}; + caps.eCodecType = codecType.value(); + caps.eChromaFormat = chromaFormat.value(); + caps.nBitDepthMinus8 = desc->comp[0].depth - 8; + + CUresult result = cuvidGetDecoderCaps(&caps); + if (result != CUDA_SUCCESS) { + return false; + } + + if (!caps.bIsSupported) { + return false; + } + + auto coded_width = static_cast(codecContext->coded_width); + auto coded_height = static_cast(codecContext->coded_height); + if (coded_width < static_cast(caps.nMinWidth) || + coded_height < static_cast(caps.nMinHeight) || + coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) { + return false; + } + + // See nMaxMBCount in cuviddec.h + constexpr unsigned int macroblockConstant = 256; + if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) { + return false; + } + + // We'll set the decoderParams.OutputFormat to NV12, so we need to make + // sure it's actually supported. + // TODO: If this fail, we could consider decoding to something else than NV12 + // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is + // what FFmpeg does. + bool supportsNV12Output = + (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1; + if (!supportsNV12Output) { + return false; } + + return true; } } // namespace @@ -232,6 +249,19 @@ void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, [[maybe_unused]] const SharedAVCodecContext& codecContext) { + if (!nativeNVDECSupport(codecContext)) { + cpuFallback_ = createDeviceInterface(torch::kCPU); + TORCH_CHECK( + cpuFallback_ != nullptr, "Failed to create CPU device interface"); + cpuFallback_->initialize(avStream, avFormatCtx, codecContext); + cpuFallback_->initializeVideo( + VideoStreamOptions(), + {}, + /*resizedOutputDims=*/std::nullopt); + // We'll always use the CPU fallback from now on, so we can return early. + return; + } + TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; @@ -243,7 +273,11 @@ void BetaCudaDeviceInterface::initialize( // Create parser. Default values that aren't obvious are taken from DALI. CUVIDPARSERPARAMS parserParams = {}; - parserParams.CodecType = validateCodecSupport(codecPar->codec_id); + auto codecType = validateCodecSupport(codecPar->codec_id); + TORCH_CHECK( + codecType.has_value(), + "This should never happen, we should be using the CPU fallback by now. Please report a bug."); + parserParams.CodecType = codecType.value(); parserParams.ulMaxNumDecodeSurfaces = 8; parserParams.ulMaxDisplayDelay = 0; // Callback setup, all are triggered by the parser within a call @@ -383,6 +417,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to // the NVCUVID parser. int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { + if (cpuFallback_) { + return cpuFallback_->sendPacket(packet); + } + TORCH_CHECK( packet.get() && packet->data && packet->size > 0, "sendPacket received an empty packet, this is unexpected, please report."); @@ -406,6 +444,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { } int BetaCudaDeviceInterface::sendEOFPacket() { + if (cpuFallback_) { + return cpuFallback_->sendEOFPacket(); + } + CUVIDSOURCEDATAPACKET cuvidPacket = {}; cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM; eofSent_ = true; @@ -467,6 +509,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder( // Moral equivalent of avcodec_receive_frame(). int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { + if (cpuFallback_) { + return cpuFallback_->receiveFrame(avFrame); + } + if (readyFrames_.empty()) { // No frame found, instruct caller to try again later after sending more // packets, or to stop if EOF was already sent. @@ -601,6 +647,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( } void BetaCudaDeviceInterface::flush() { + if (cpuFallback_) { + cpuFallback_->flush(); + return; + } + // The NVCUVID docs mention that after seeking, i.e. when flush() is called, // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs // don't say whether this should be an empty packet, or whether it should be a @@ -618,6 +669,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { + if (cpuFallback_) { + // CPU decoded frame - need to do CPU color conversion then transfer to GPU + FrameOutput cpuFrameOutput; + cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); + + // Transfer CPU frame to GPU + if (preAllocatedOutputTensor.has_value()) { + preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); + frameOutput.data = preAllocatedOutputTensor.value(); + } else { + frameOutput.data = cpuFrameOutput.data.to(device_); + } + return; + } + // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). TORCH_CHECK( diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index fb01415d4..7424a877d 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -94,6 +94,8 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; + + std::unique_ptr cpuFallback_; }; } // namespace facebook::torchcodec diff --git a/test/test_decoders.py b/test/test_decoders.py index 32367b438..098e4e969 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1701,20 +1701,19 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): assert beta_frame.duration_seconds == ref_frame.duration_seconds @needs_cuda - def test_beta_cuda_interface_small_h265(self): - # Test to illustrate current difference in behavior between the BETA and - # the ffmpeg interface: this video isn't supported by NVDEC, but in the - # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. - - VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - + def test_beta_cuda_interface_cpu_fallback(self): + # Non-regression test for the CPU fallback behavior of the BETA CUDA + # interface. + # We know that the H265_VIDEO asset isn't supported by NVDEC, its + # dimensions are too small. We also know that the FFmpeg CUDA interface + # fallbacks to the CPU path in such cases. We assert that we fall back + # to the CPU path, too. + + ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) with set_cuda_backend("beta"): - dec = VideoDecoder(H265_VIDEO.path, device="cuda") - with pytest.raises( - RuntimeError, - match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", - ): - dec.get_frame_at(0) + beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + + torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): @@ -1740,15 +1739,20 @@ def test_set_cuda_backend(self): assert _get_cuda_backend() == "beta" def assert_decoder_uses(decoder, *, expected_backend): + # TODO: This doesn't work anymore after + # https://github.com/meta-pytorch/torchcodec/pull/977 + # We need to define a better way to assert which backend a decoder + # is using. + return # Assert that a decoder instance is using a given backend. # # We know H265_VIDEO fails on the BETA backend while it works on the # ffmpeg one. - if expected_backend == "ffmpeg": - decoder.get_frame_at(0) # this would fail if this was BETA - else: - with pytest.raises(RuntimeError, match="Video is too small"): - decoder.get_frame_at(0) + # if expected_backend == "ffmpeg": + # decoder.get_frame_at(0) # this would fail if this was BETA + # else: + # with pytest.raises(RuntimeError, match="Video is too small"): + # decoder.get_frame_at(0) # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg"