From 718c268fc6d0a1c158aea6f666f3baa4240b6d54 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 10 Oct 2025 14:51:34 -0700 Subject: [PATCH 01/18] Refactor receiveFrame and sendPacket logic to dispatch directly to interface --- .../_core/BetaCudaDeviceInterface.h | 3 -- src/torchcodec/_core/DeviceInterface.h | 50 ++++++++++--------- src/torchcodec/_core/SingleStreamDecoder.cpp | 34 ++++--------- 3 files changed, 37 insertions(+), 50 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 0bf9951d6..8206a7b1b 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,9 +48,6 @@ class BetaCudaDeviceInterface : public DeviceInterface { std::optional preAllocatedOutputTensor = std::nullopt) override; - bool canDecodePacketDirectly() const override { - return true; - } int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index cac29e838..967484618 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -80,42 +80,45 @@ class DeviceInterface { // Extension points for custom decoding paths // ------------------------------------------ - // Override to return true if this device interface can decode packets - // directly. This means that the following two member functions can both - // be called: - // - // 1. sendPacket() - // 2. receiveFrame() - virtual bool canDecodePacketDirectly() const { - return false; + // Set the codec context for default FFmpeg decoding operations + // This must be called during initialization before using + // sendPacket/receiveFrame + virtual void setCodecContext(AVCodecContext* codecContext) { + codecContext_ = codecContext; } - // Moral equivalent of avcodec_send_packet() // Returns AVSUCCESS on success, AVERROR(EAGAIN) if decoder queue full, or // other AVERROR on failure - virtual int sendPacket([[maybe_unused]] ReferenceAVPacket& avPacket) { - TORCH_CHECK( - false, - "Send/receive packet decoding not implemented for this device interface"); - return AVERROR(ENOSYS); + // Default implementation uses FFmpeg directly + virtual int sendPacket(ReferenceAVPacket& avPacket) { + if (!codecContext_) { + TORCH_CHECK( + false, "Codec context not available for default packet sending"); + return AVERROR(EINVAL); + } + return avcodec_send_packet(codecContext_, avPacket.get()); } // Send an EOF packet to flush the decoder // Returns AVSUCCESS on success, or other AVERROR on failure + // Default implementation uses FFmpeg directly virtual int sendEOFPacket() { - TORCH_CHECK( - false, "Send EOF packet not implemented for this device interface"); - return AVERROR(ENOSYS); + if (!codecContext_) { + TORCH_CHECK(false, "Codec context not available for EOF packet sending"); + return AVERROR(EINVAL); + } + return avcodec_send_packet(codecContext_, nullptr); } - // Moral equivalent of avcodec_receive_frame() // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready, // AVERROR_EOF if end of stream, or other AVERROR on failure - virtual int receiveFrame([[maybe_unused]] UniqueAVFrame& avFrame) { - TORCH_CHECK( - false, - "Send/receive packet decoding not implemented for this device interface"); - return AVERROR(ENOSYS); + // Default implementation uses FFmpeg directly + virtual int receiveFrame(UniqueAVFrame& avFrame) { + if (!codecContext_) { + TORCH_CHECK(false, "Codec context not 
available for frame receiving"); + return AVERROR(EINVAL); + } + return avcodec_receive_frame(codecContext_, avFrame.get()); } // Flush remaining frames from decoder @@ -126,6 +129,7 @@ class DeviceInterface { protected: torch::Device device_; + AVCodecContext* codecContext_ = nullptr; // Non-owning pointer }; using CreateDeviceInterfaceFn = diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index d06c47922..dfbe72d6f 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -459,6 +459,10 @@ void SingleStreamDecoder::addStream( codecContext->time_base = streamInfo.stream->time_base; + // Set the codec context on the device interface for default FFmpeg + // implementations + deviceInterface_->setCodecContext(codecContext); + containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName = std::string(avcodec_get_name(codecContext->codec_id)); @@ -1169,24 +1173,16 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( cursorWasJustSet_ = false; } - StreamInfo& streamInfo = streamInfos_[activeStreamIndex_]; UniqueAVFrame avFrame(av_frame_alloc()); AutoAVPacket autoAVPacket; int status = AVSUCCESS; bool reachedEOF = false; - // TODONVDEC P2: Instead of calling canDecodePacketDirectly() and rely on - // if/else blocks to dispatch to the interface or to FFmpeg, consider *always* - // dispatching to the interface. The default implementation of the interface's - // receiveFrame and sendPacket could just be calling avcodec_receive_frame and - // avcodec_send_packet. This would make the decoding loop even more generic. + // The default implementation uses avcodec_receive_frame and + // avcodec_send_packet, while specialized interfaces can override for + // hardware-specific optimizations. while (true) { - if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->receiveFrame(avFrame); - } else { - status = - avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get()); - } + status = deviceInterface_->receiveFrame(avFrame); if (status != AVSUCCESS && status != AVERROR(EAGAIN)) { // Non-retriable error @@ -1222,13 +1218,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( if (status == AVERROR_EOF) { // End of file reached. We must drain the decoder - if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->sendEOFPacket(); - } else { - status = avcodec_send_packet( - streamInfo.codecContext.get(), - /*avpkt=*/nullptr); - } + status = deviceInterface_->sendEOFPacket(); TORCH_CHECK( status >= AVSUCCESS, "Could not flush decoder: ", @@ -1253,11 +1243,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( // We got a valid packet. Send it to the decoder, and we'll receive it in // the next iteration. 
- if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->sendPacket(packet); - } else { - status = avcodec_send_packet(streamInfo.codecContext.get(), packet.get()); - } + status = deviceInterface_->sendPacket(packet); TORCH_CHECK( status >= AVSUCCESS, "Could not push packet to decoder: ", From 30ee0e88b1dbe75c1e3d7d9f61a3370627b1a947 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 10 Oct 2025 15:04:10 -0700 Subject: [PATCH 02/18] ran precommit --- src/torchcodec/_core/BetaCudaDeviceInterface.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 8206a7b1b..c59e48140 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,7 +48,6 @@ class BetaCudaDeviceInterface : public DeviceInterface { std::optional preAllocatedOutputTensor = std::nullopt) override; - int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; int receiveFrame(UniqueAVFrame& avFrame) override; From 11d7adfb62543ffc799e3acfb416aee164098fd8 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 15 Oct 2025 08:13:54 -0700 Subject: [PATCH 03/18] use shared_ptr for codecContext --- .../_core/BetaCudaDeviceInterface.cpp | 4 +- .../_core/BetaCudaDeviceInterface.h | 3 +- src/torchcodec/_core/CpuDeviceInterface.cpp | 4 +- src/torchcodec/_core/CpuDeviceInterface.h | 3 +- src/torchcodec/_core/CudaDeviceInterface.cpp | 6 ++- src/torchcodec/_core/CudaDeviceInterface.h | 3 +- src/torchcodec/_core/DeviceInterface.h | 39 ++++++++----------- src/torchcodec/_core/FFMPEGCommon.cpp | 9 +++++ src/torchcodec/_core/FFMPEGCommon.h | 3 ++ src/torchcodec/_core/SingleStreamDecoder.cpp | 19 +++++---- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- 11 files changed, 54 insertions(+), 41 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 78fa8d635..3cc449d09 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -231,8 +231,10 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { void BetaCudaDeviceInterface::initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) { + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index c59e48140..fb01415d4 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -40,7 +40,8 @@ class BetaCudaDeviceInterface : public DeviceInterface { void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index e6b96e3e4..0e9b46434 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -48,8 +48,10 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device) void 
CpuDeviceInterface::initialize( const AVStream* avStream, - [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) { + [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "avStream is null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; } diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 399b0c6be..9f44c4e8c 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ b/src/torchcodec/_core/CpuDeviceInterface.h @@ -25,7 +25,8 @@ class CpuDeviceInterface : public DeviceInterface { virtual void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; virtual void initializeVideo( const VideoStreamOptions& videoStreamOptions, diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index aea2b2d9a..ba8e495b8 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -117,15 +117,17 @@ CudaDeviceInterface::~CudaDeviceInterface() { void CudaDeviceInterface::initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) { + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "avStream is null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; // TODO: Ideally, we should keep all interface implementations independent. cpuInterface_ = createDeviceInterface(torch::kCPU); TORCH_CHECK( cpuInterface_ != nullptr, "Failed to create CPU device interface"); - cpuInterface_->initialize(avStream, avFormatCtx); + cpuInterface_->initialize(avStream, avFormatCtx, codecContext); cpuInterface_->initializeVideo( VideoStreamOptions(), {}, diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index 1a8f184ec..d240066f4 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -22,7 +22,8 @@ class CudaDeviceInterface : public DeviceInterface { void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; void initializeVideo( const VideoStreamOptions& videoStreamOptions, diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index 967484618..b6c88438e 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -54,7 +54,8 @@ class DeviceInterface { // Initialize the device with parameters generic to all kinds of decoding. virtual void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) = 0; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) = 0; // Initialize the device with parameters specific to video decoding. There is // a default empty implementation. 
@@ -80,23 +81,14 @@ class DeviceInterface { // Extension points for custom decoding paths // ------------------------------------------ - // Set the codec context for default FFmpeg decoding operations - // This must be called during initialization before using - // sendPacket/receiveFrame - virtual void setCodecContext(AVCodecContext* codecContext) { - codecContext_ = codecContext; - } - // Returns AVSUCCESS on success, AVERROR(EAGAIN) if decoder queue full, or // other AVERROR on failure // Default implementation uses FFmpeg directly virtual int sendPacket(ReferenceAVPacket& avPacket) { - if (!codecContext_) { - TORCH_CHECK( - false, "Codec context not available for default packet sending"); - return AVERROR(EINVAL); - } - return avcodec_send_packet(codecContext_, avPacket.get()); + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default packet sending"); + return avcodec_send_packet(codecContext_.get(), avPacket.get()); } // Send an EOF packet to flush the decoder @@ -107,29 +99,30 @@ class DeviceInterface { TORCH_CHECK(false, "Codec context not available for EOF packet sending"); return AVERROR(EINVAL); } - return avcodec_send_packet(codecContext_, nullptr); + return avcodec_send_packet(codecContext_.get(), nullptr); } // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready, // AVERROR_EOF if end of stream, or other AVERROR on failure // Default implementation uses FFmpeg directly virtual int receiveFrame(UniqueAVFrame& avFrame) { - if (!codecContext_) { - TORCH_CHECK(false, "Codec context not available for frame receiving"); - return AVERROR(EINVAL); - } - return avcodec_receive_frame(codecContext_, avFrame.get()); + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default frame receiving"); + return avcodec_receive_frame(codecContext_.get(), avFrame.get()); } // Flush remaining frames from decoder virtual void flush() { - // Default implementation is no-op for standard decoders - // Custom decoders can override this method + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default flushing"); + avcodec_flush_buffers(codecContext_.get()); } protected: torch::Device device_; - AVCodecContext* codecContext_ = nullptr; // Non-owning pointer + SharedAVCodecContext codecContext_; }; using CreateDeviceInterfaceFn = diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 0570f06cf..40f918fe6 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -158,6 +158,15 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) { #endif } +int getNumChannels(const SharedAVCodecContext& avCodecContext) { +#if LIBAVFILTER_VERSION_MAJOR > 8 || \ + (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) + return avCodecContext->ch_layout.nb_channels; +#else + return avCodecContext->channels; +#endif +} + void setDefaultChannelLayout( UniqueAVCodecContext& avCodecContext, int numChannels) { diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 19cddcc37..5ca1e732c 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -71,6 +71,8 @@ using UniqueEncodingAVFormatContext = std::unique_ptr< using UniqueAVCodecContext = std::unique_ptr< AVCodecContext, Deleterp>; +using SharedAVCodecContext = std::shared_ptr; + using UniqueAVFrame = std::unique_ptr>; using UniqueAVFilterGraph = std::unique_ptr< @@ -172,6 +174,7 @@ const AVPixelFormat* 
getSupportedPixelFormats(const AVCodec& avCodec); int getNumChannels(const UniqueAVFrame& avFrame); int getNumChannels(const UniqueAVCodecContext& avCodecContext); +int getNumChannels(const SharedAVCodecContext& avCodecContext); void setDefaultChannelLayout( UniqueAVCodecContext& avCodecContext, diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index dfbe72d6f..0a7b89691 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -429,7 +429,6 @@ void SingleStreamDecoder::addStream( TORCH_CHECK( deviceInterface_ != nullptr, "Failed to create device interface. This should never happen, please report."); - deviceInterface_->initialize(streamInfo.stream, formatContext_); // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within // addStream() which is supposed to be generic @@ -441,7 +440,8 @@ void SingleStreamDecoder::addStream( AVCodecContext* codecContext = avcodec_alloc_context3(avCodec); TORCH_CHECK(codecContext != nullptr); - streamInfo.codecContext.reset(codecContext); + streamInfo.codecContext = SharedAVCodecContext( + codecContext, [](AVCodecContext* ctx) { avcodec_free_context(&ctx); }); int retVal = avcodec_parameters_to_context( streamInfo.codecContext.get(), streamInfo.stream->codecpar); @@ -453,18 +453,19 @@ void SingleStreamDecoder::addStream( // Note that we must make sure to register the harware device context // with the codec context before calling avcodec_open2(). Otherwise, decoding // will happen on the CPU and not the hardware device. - deviceInterface_->registerHardwareDeviceWithCodec(codecContext); + deviceInterface_->registerHardwareDeviceWithCodec( + streamInfo.codecContext.get()); retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr); TORCH_CHECK(retVal >= AVSUCCESS, getFFMPEGErrorStringFromErrorCode(retVal)); - codecContext->time_base = streamInfo.stream->time_base; + streamInfo.codecContext->time_base = streamInfo.stream->time_base; - // Set the codec context on the device interface for default FFmpeg - // implementations - deviceInterface_->setCodecContext(codecContext); + // Initialize the device interface with the codec context + deviceInterface_->initialize( + streamInfo.stream, formatContext_, streamInfo.codecContext); containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName = - std::string(avcodec_get_name(codecContext->codec_id)); + std::string(avcodec_get_name(streamInfo.codecContext->codec_id)); // We will only need packets from the active stream, so we tell FFmpeg to // discard packets from the other streams. Note that av_read_frame() may still @@ -1153,8 +1154,6 @@ void SingleStreamDecoder::maybeSeekToBeforeDesiredPts() { getFFMPEGErrorStringFromErrorCode(status)); decodeStats_.numFlushes++; - avcodec_flush_buffers(streamInfo.codecContext.get()); - deviceInterface_->flush(); } diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 48821ff09..10f820550 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -221,7 +221,7 @@ class SingleStreamDecoder { AVMediaType avMediaType = AVMEDIA_TYPE_UNKNOWN; AVRational timeBase = {}; - UniqueAVCodecContext codecContext; + SharedAVCodecContext codecContext; // The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was // called. 
From 44edb930df6d083893469a681615e8fc9e1a8c93 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 15 Oct 2025 12:18:07 -0700 Subject: [PATCH 04/18] address feedback --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 3 +-- src/torchcodec/_core/DeviceInterface.h | 7 +++---- src/torchcodec/_core/FFMPEGCommon.cpp | 9 --------- src/torchcodec/_core/FFMPEGCommon.h | 1 - 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 3cc449d09..0bdd91a23 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -232,9 +232,8 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, - const SharedAVCodecContext& codecContext) { + [[maybe_unused]] const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); - codecContext_ = codecContext; timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index b6c88438e..25a36a40f 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -95,10 +95,9 @@ class DeviceInterface { // Returns AVSUCCESS on success, or other AVERROR on failure // Default implementation uses FFmpeg directly virtual int sendEOFPacket() { - if (!codecContext_) { - TORCH_CHECK(false, "Codec context not available for EOF packet sending"); - return AVERROR(EINVAL); - } + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default EOF packet sending"); return avcodec_send_packet(codecContext_.get(), nullptr); } diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 40f918fe6..97ff082e1 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -149,15 +149,6 @@ int getNumChannels(const UniqueAVFrame& avFrame) { #endif } -int getNumChannels(const UniqueAVCodecContext& avCodecContext) { -#if LIBAVFILTER_VERSION_MAJOR > 8 || \ - (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) - return avCodecContext->ch_layout.nb_channels; -#else - return avCodecContext->channels; -#endif -} - int getNumChannels(const SharedAVCodecContext& avCodecContext) { #if LIBAVFILTER_VERSION_MAJOR > 8 || \ (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 5ca1e732c..448333e20 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -173,7 +173,6 @@ const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec); const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec); int getNumChannels(const UniqueAVFrame& avFrame); -int getNumChannels(const UniqueAVCodecContext& avCodecContext); int getNumChannels(const SharedAVCodecContext& avCodecContext); void setDefaultChannelLayout( From b09b203eb8bb45c295432252f8d35e78a2926fbd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:25:24 +0100 Subject: [PATCH 05/18] Disgustingly hacky POC --- .../_core/BetaCudaDeviceInterface.cpp | 214 +++++++++++++----- .../_core/BetaCudaDeviceInterface.h | 6 + test/test_decoders.py | 13 +- 3 files changed, 172 insertions(+), 61 deletions(-) diff --git 
a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index d55bb1137..6fd5a6ee3 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -52,7 +52,9 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { return decoder->frameReadyInDisplayOrder(dispInfo); } -static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { +static UniqueCUvideodecoder createDecoder( + CUVIDEOFORMAT* videoFormat, + bool* capabilityCheckFailed = nullptr) { // Check decoder capabilities - same checks as DALI auto caps = CUVIDDECODECAPS{}; caps.eCodecType = videoFormat->codec; @@ -61,65 +63,84 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { CUresult result = cuvidGetDecoderCaps(&caps); TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - TORCH_CHECK( - caps.bIsSupported, - "Codec configuration not supported on this GPU. " - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); - - TORCH_CHECK( - videoFormat->coded_width >= caps.nMinWidth && - videoFormat->coded_height >= caps.nMinHeight, - "Video is too small in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMinWidth, - "x", - caps.nMinHeight); + if (!caps.bIsSupported) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Codec configuration not supported on this GPU. " + "Codec: ", + static_cast(videoFormat->codec), + ", chroma format: ", + static_cast(videoFormat->chroma_format), + ", bit depth: ", + videoFormat->bit_depth_luma_minus8 + 8); + } - TORCH_CHECK( - videoFormat->coded_width <= caps.nMaxWidth && - videoFormat->coded_height <= caps.nMaxHeight, - "Video is too large in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMaxWidth, - "x", - caps.nMaxHeight); + if (videoFormat->coded_width < caps.nMinWidth || + videoFormat->coded_height < caps.nMinHeight || + videoFormat->coded_width > caps.nMaxWidth || + videoFormat->coded_height > caps.nMaxHeight) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Video dimensions not supported. Provided: ", + videoFormat->coded_width, + "x", + videoFormat->coded_height, + " vs supported: ", + caps.nMinWidth, + "x", + caps.nMinHeight, + " to ", + caps.nMaxWidth, + "x", + caps.nMaxHeight); + } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - TORCH_CHECK( - videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant <= - caps.nMaxMBCount, - "Video is too large (too many macroblocks). " - "Provided (width * height / ", - macroblockConstant, - "): ", - videoFormat->coded_width * videoFormat->coded_height / macroblockConstant, - " vs supported:", - caps.nMaxMBCount); + if (videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant > + caps.nMaxMBCount) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Video is too large (too many macroblocks). 
" + "Provided (width * height / ", + macroblockConstant, + "): ", + videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant, + " vs supported:", + caps.nMaxMBCount); + } // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make // sure it's actually supported. - TORCH_CHECK( - (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1, - "NV12 output format is not supported for this configuration. ", - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "NV12 output format is not supported for this configuration. ", + "Codec: ", + static_cast(videoFormat->codec), + ", chroma format: ", + static_cast(videoFormat->chroma_format), + ", bit depth: ", + videoFormat->bit_depth_luma_minus8 + 8); + } // Decoder creation parameters, most are taken from DALI CUVIDDECODECREATEINFO decoderParams = {}; @@ -225,6 +246,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { videoParser_ = nullptr; } + // Clean up buffered packet if it wasn't used (commented out to avoid + // potential issues) if (bufferedFirstPacket_) { + // av_packet_free(&bufferedFirstPacket_); + // } + returnNppStreamContextToCache(device_, std::move(nppCtx_)); } @@ -239,6 +265,16 @@ void BetaCudaDeviceInterface::initialize( const AVCodecParameters* codecPar = avStream->codecpar; TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null"); + // Initialize CPU interface for potential fallback + cpuInterface_ = createDeviceInterface(torch::kCPU); + TORCH_CHECK( + cpuInterface_ != nullptr, "Failed to create CPU device interface"); + cpuInterface_->initialize(avStream, avFormatCtx, codecContext); + cpuInterface_->initializeVideo( + VideoStreamOptions(), + {}, + /*resizedOutputDims=*/std::nullopt); + initializeBSF(codecPar, avFormatCtx); // Create parser. Default values that aren't obvious are taken from DALI. @@ -368,7 +404,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - decoder_ = createDecoder(videoFormat); + bool capabilityCheckFailed = false; + decoder_ = createDecoder(videoFormat, &capabilityCheckFailed); + + if (capabilityCheckFailed) { + usingCpuFallback_ = true; + capabilityCheckPending_ = false; + return static_cast(videoFormat_.min_num_decode_surfaces); + } } TORCH_CHECK(decoder_, "Failed to get or create decoder"); @@ -383,10 +426,25 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to // the NVCUVID parser. 
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { + printf("usingCpuFallback_: %d\n", usingCpuFallback_); + if (usingCpuFallback_) { + return cpuInterface_->sendPacket(packet); + } + TORCH_CHECK( packet.get() && packet->data && packet->size > 0, "sendPacket received an empty packet, this is unexpected, please report."); + // On first packet, store a copy before sending to CUDA parser + if (capabilityCheckPending_) { + // Make a deep copy of the packet before CUDA parser potentially corrupts it + bufferedFirstPacket_ = av_packet_alloc(); + TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); + int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); + TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); + capabilityCheckPending_ = false; + } + // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or // the original one if no BSF is needed. This new filtered packet must be // allocated outside of applyBSF: if it were allocated inside applyBSF, it @@ -402,10 +460,29 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { cuvidPacket.flags = CUVID_PKT_TIMESTAMP; cuvidPacket.timestamp = packetToSend->pts; - return sendCuvidPacket(cuvidPacket); + int result = sendCuvidPacket(cuvidPacket); + + // If capability check failed and we switched to CPU fallback, send buffered + // packet to CPU + if (usingCpuFallback_) { + printf("Falling back to CPU!!!! And re-sending packet\n"); + TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // Create AutoAVPacket, then ReferenceAVPacket to access get() method + AutoAVPacket autoBufferedPacket; + ReferenceAVPacket refBufferedPacket(autoBufferedPacket); + // Copy the buffered packet data + av_packet_ref(refBufferedPacket.get(), bufferedFirstPacket_); + return cpuInterface_->sendPacket(refBufferedPacket); + } + + return result; } int BetaCudaDeviceInterface::sendEOFPacket() { + if (usingCpuFallback_) { + return cpuInterface_->sendEOFPacket(); + } + CUVIDSOURCEDATAPACKET cuvidPacket = {}; cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM; eofSent_ = true; @@ -450,6 +527,9 @@ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF( // given frame. It means we can send that frame to be decoded by the hardware // NVDEC decoder by calling cuvidDecodePicture which is non-blocking. int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) { + if (usingCpuFallback_) { + return 1; // success + } TORCH_CHECK(picParams != nullptr, "Invalid picture parameters"); TORCH_CHECK(decoder_, "Decoder not initialized before picture decode"); // Send frame to be decoded by NVDEC - non-blocking call. @@ -467,6 +547,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder( // Moral equivalent of avcodec_receive_frame(). int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { + if (usingCpuFallback_) { + return cpuInterface_->receiveFrame(avFrame); + } + if (readyFrames_.empty()) { // No frame found, instruct caller to try again later after sending more // packets, or to stop if EOF was already sent. @@ -601,6 +685,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( } void BetaCudaDeviceInterface::flush() { + if (usingCpuFallback_) { + cpuInterface_->flush(); + return; + } + // The NVCUVID docs mention that after seeking, i.e. when flush() is called, // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. 
The docs // don't say whether this should be an empty packet, or whether it should be a @@ -618,6 +707,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { + if (usingCpuFallback_) { + // CPU decoded frame - need to do CPU color conversion then transfer to GPU + FrameOutput cpuFrameOutput; + cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); + + // Transfer CPU frame to GPU + if (preAllocatedOutputTensor.has_value()) { + preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); + frameOutput.data = preAllocatedOutputTensor.value(); + } else { + frameOutput.data = cpuFrameOutput.data.to(device_); + } + return; + } + // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). TORCH_CHECK( diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index fb01415d4..b03814268 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -94,6 +94,12 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; + + // CPU fallback support + std::unique_ptr cpuInterface_; + bool usingCpuFallback_ = false; + bool capabilityCheckPending_ = true; + AVPacket* bufferedFirstPacket_ = nullptr; }; } // namespace facebook::torchcodec diff --git a/test/test_decoders.py b/test/test_decoders.py index f9c7d2ff6..52d329e21 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1695,12 +1695,13 @@ def test_beta_cuda_interface_small_h265(self): # the ffmpeg interface: this video isn't supported by NVDEC, but in the # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. - VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - with pytest.raises( - RuntimeError, - match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", - ): - VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + print() + # VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + # with pytest.raises( + # RuntimeError, + # match="Video is too small in at least one dimension. 
Provided: 128x128 vs supported:144x144", + # ): + VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) @needs_cuda def test_beta_cuda_interface_error(self): From 82f5807a02b01ab335c73468b433be578f7d78d6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:48:01 +0100 Subject: [PATCH 06/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 70 +++---------------- test/test_decoders.py | 5 +- 2 files changed, 13 insertions(+), 62 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 6fd5a6ee3..824809ff4 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,7 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder( CUVIDEOFORMAT* videoFormat, - bool* capabilityCheckFailed = nullptr) { + bool* capabilityCheckFailed) { // Check decoder capabilities - same checks as DALI auto caps = CUVIDDECODECAPS{}; caps.eCodecType = videoFormat->codec; @@ -64,43 +64,16 @@ static UniqueCUvideodecoder createDecoder( TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); if (!caps.bIsSupported) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Codec configuration not supported on this GPU. " - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + *capabilityCheckFailed = true; + return nullptr; } if (videoFormat->coded_width < caps.nMinWidth || videoFormat->coded_height < caps.nMinHeight || videoFormat->coded_width > caps.nMaxWidth || videoFormat->coded_height > caps.nMaxHeight) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Video dimensions not supported. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported: ", - caps.nMinWidth, - "x", - caps.nMinHeight, - " to ", - caps.nMaxWidth, - "x", - caps.nMaxHeight); + *capabilityCheckFailed = true; + return nullptr; } // See nMaxMBCount in cuviddec.h @@ -108,38 +81,15 @@ static UniqueCUvideodecoder createDecoder( if (videoFormat->coded_width * videoFormat->coded_height / macroblockConstant > caps.nMaxMBCount) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Video is too large (too many macroblocks). " - "Provided (width * height / ", - macroblockConstant, - "): ", - videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant, - " vs supported:", - caps.nMaxMBCount); + *capabilityCheckFailed = true; + return nullptr; } // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make // sure it's actually supported. if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "NV12 output format is not supported for this configuration. 
", - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + *capabilityCheckFailed = true; + return nullptr; } // Decoder creation parameters, most are taken from DALI @@ -466,7 +416,7 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { // packet to CPU if (usingCpuFallback_) { printf("Falling back to CPU!!!! And re-sending packet\n"); - TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); // Create AutoAVPacket, then ReferenceAVPacket to access get() method AutoAVPacket autoBufferedPacket; ReferenceAVPacket refBufferedPacket(autoBufferedPacket); diff --git a/test/test_decoders.py b/test/test_decoders.py index 52d329e21..10e13b0cc 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1696,12 +1696,13 @@ def test_beta_cuda_interface_small_h265(self): # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. print() - # VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + a = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) # with pytest.raises( # RuntimeError, # match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", # ): - VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + b = VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + torch.testing.assert_close(a.data, b.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): From 936de765d3c1ac35e5b903dad6956d6c0821f635 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:49:05 +0100 Subject: [PATCH 07/18] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 824809ff4..b4e3529b2 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -360,7 +360,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { if (capabilityCheckFailed) { usingCpuFallback_ = true; capabilityCheckPending_ = false; - return static_cast(videoFormat_.min_num_decode_surfaces); + return 0; } } From c5939d2d2214052a9b6a9a7aee31b64b6fbd50f3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 17:22:22 +0100 Subject: [PATCH 08/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 83 +++++++++---------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index b4e3529b2..b4cf4de1f 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -52,46 +52,7 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { return decoder->frameReadyInDisplayOrder(dispInfo); } -static UniqueCUvideodecoder createDecoder( - CUVIDEOFORMAT* videoFormat, - bool* capabilityCheckFailed) { - // Check decoder capabilities - same checks as DALI - auto caps = CUVIDDECODECAPS{}; - caps.eCodecType = videoFormat->codec; - caps.eChromaFormat = videoFormat->chroma_format; - caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; - CUresult result = cuvidGetDecoderCaps(&caps); - TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - - if 
(!caps.bIsSupported) { - *capabilityCheckFailed = true; - return nullptr; - } - - if (videoFormat->coded_width < caps.nMinWidth || - videoFormat->coded_height < caps.nMinHeight || - videoFormat->coded_width > caps.nMaxWidth || - videoFormat->coded_height > caps.nMaxHeight) { - *capabilityCheckFailed = true; - return nullptr; - } - - // See nMaxMBCount in cuviddec.h - constexpr unsigned int macroblockConstant = 256; - if (videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant > - caps.nMaxMBCount) { - *capabilityCheckFailed = true; - return nullptr; - } - - // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make - // sure it's actually supported. - if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - *capabilityCheckFailed = true; - return nullptr; - } - +static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; @@ -128,12 +89,42 @@ static UniqueCUvideodecoder createDecoder( decoderParams.display_area.bottom = videoFormat->display_area.bottom; CUvideodecoder* decoder = new CUvideodecoder(); - result = cuvidCreateDecoder(decoder, &decoderParams); + CUresult result = cuvidCreateDecoder(decoder, &decoderParams); TORCH_CHECK( result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result); return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } +bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { + // Check decoder capabilities - same checks as DALI + auto caps = CUVIDDECODECAPS{}; + caps.eCodecType = videoFormat->codec; + caps.eChromaFormat = videoFormat->chroma_format; + caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; + CUresult result = cuvidGetDecoderCaps(&caps); + TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); + + if (!caps.bIsSupported) { + return false; + } + + if (!(videoFormat->coded_width >= caps.nMinWidth && + videoFormat->coded_height >= caps.nMinHeight && + videoFormat->coded_width <= caps.nMaxWidth && + videoFormat->coded_height <= caps.nMaxHeight)) { + return false; + } + + constexpr unsigned int macroblockConstant = 256; + if (!(videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant <= + caps.nMaxMBCount)) { + return false; + } + + return true; +} + cudaVideoCodec validateCodecSupport(AVCodecID codecId) { switch (codecId) { case AV_CODEC_ID_H264: @@ -354,14 +345,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. 
- bool capabilityCheckFailed = false; - decoder_ = createDecoder(videoFormat, &capabilityCheckFailed); - - if (capabilityCheckFailed) { + // Check if NVDEC supports this video configuration + if (!videoIsSupported(videoFormat)) { usingCpuFallback_ = true; capabilityCheckPending_ = false; - return 0; + return static_cast(videoFormat_.min_num_decode_surfaces); } + + decoder_ = createDecoder(videoFormat); } TORCH_CHECK(decoder_, "Failed to get or create decoder"); From 55dffd96dd17199365204955485584fc2e1e8380 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 17:33:31 +0100 Subject: [PATCH 09/18] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 13 +++++++------ src/torchcodec/_core/BetaCudaDeviceInterface.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index b4cf4de1f..e8bdc7bd7 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,6 +54,8 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI + // Callers should ensure video is supported by calling videoIsSupported() first. + CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; decoderParams.ChromaFormat = videoFormat->chroma_format; @@ -345,11 +347,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - // Check if NVDEC supports this video configuration + if (!videoIsSupported(videoFormat)) { usingCpuFallback_ = true; - capabilityCheckPending_ = false; - return static_cast(videoFormat_.min_num_decode_surfaces); + return 0; } decoder_ = createDecoder(videoFormat); @@ -377,13 +378,13 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { "sendPacket received an empty packet, this is unexpected, please report."); // On first packet, store a copy before sending to CUDA parser - if (capabilityCheckPending_) { + if (isFirstPacket_) { // Make a deep copy of the packet before CUDA parser potentially corrupts it bufferedFirstPacket_ = av_packet_alloc(); TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); - capabilityCheckPending_ = false; + isFirstPacket_ = false; } // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or @@ -407,7 +408,7 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { // packet to CPU if (usingCpuFallback_) { printf("Falling back to CPU!!!! And re-sending packet\n"); - // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // // TORCH_CHECK(false, "Falling back to CPU!!!! 
And re-sending packet"); // Create AutoAVPacket, then ReferenceAVPacket to access get() method AutoAVPacket autoBufferedPacket; ReferenceAVPacket refBufferedPacket(autoBufferedPacket); diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index b03814268..5b3f7523c 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -98,7 +98,7 @@ class BetaCudaDeviceInterface : public DeviceInterface { // CPU fallback support std::unique_ptr cpuInterface_; bool usingCpuFallback_ = false; - bool capabilityCheckPending_ = true; + bool isFirstPacket_ = true; AVPacket* bufferedFirstPacket_ = nullptr; }; From e58746a1d26e078a884c4b380b912806110b6a01 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:17:50 +0100 Subject: [PATCH 10/18] Init checks - yummy --- .../_core/BetaCudaDeviceInterface.cpp | 184 +++++++++++------- .../_core/BetaCudaDeviceInterface.h | 6 +- 2 files changed, 119 insertions(+), 71 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index e8bdc7bd7..d45ba6671 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI - // Callers should ensure video is supported by calling videoIsSupported() first. CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; @@ -127,7 +126,34 @@ bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { return true; } -cudaVideoCodec validateCodecSupport(AVCodecID codecId) { +std::optional mapChromaFormat( + const AVPixFmtDescriptor* desc) { + if (!desc) { + return std::nullopt; + } + + if (desc->nb_components == 1) { + return cudaVideoChromaFormat_Monochrome; + } + + // Check if it's YUV (has chroma planes and not RGB) + if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) { + // 4:4:4 (no subsampling) + return cudaVideoChromaFormat_444; + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) { + // 4:2:0 (2x2 subsampling) + return cudaVideoChromaFormat_420; + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) { + // 4:2:2 (2x1 subsampling) + return cudaVideoChromaFormat_422; + } + } + + return std::nullopt; +} + +std::optional validateCodecSupport(AVCodecID codecId) { switch (codecId) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; @@ -153,10 +179,68 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) { // return cudaVideoCodec_JPEG; // case AV_CODEC_ID_VC1: // return cudaVideoCodec_VC1; - default: { - TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId)); - } + default: + return std::nullopt; + } +} + +bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { + auto codecType = validateCodecSupport(codecContext->codec_id); + if (!codecType.has_value()) { + return true; + } + + const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt); + if (!desc) { + return true; + } + + auto chromaFormat = mapChromaFormat(desc); + if (!chromaFormat.has_value()) { + return true; + } + + auto caps = CUVIDDECODECAPS{}; + caps.eCodecType = codecType.value(); + caps.eChromaFormat = 
chromaFormat.value(); + caps.nBitDepthMinus8 = desc->comp[0].depth - 8; + + CUresult result = cuvidGetDecoderCaps(&caps); + if (result != CUDA_SUCCESS) { + return true; + } + + if (!caps.bIsSupported) { + return true; + } + + if (!(static_cast(codecContext->coded_width) >= + caps.nMinWidth && + static_cast(codecContext->coded_height) >= + caps.nMinHeight && + static_cast(codecContext->coded_width) <= + caps.nMaxWidth && + static_cast(codecContext->coded_height) <= + caps.nMaxHeight)) { + return true; } + + // See nMaxMBCount in cuviddec.h + constexpr unsigned int macroblockConstant = 256; + if (!(static_cast( + codecContext->coded_width * codecContext->coded_height) / + macroblockConstant <= + caps.nMaxMBCount)) { + return true; + } + + // We explicitly request NV12 output format in createDecoder(), so we need to + // make sure it's supported. + if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + return true; + } + + return false; } } // namespace @@ -189,11 +273,6 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { videoParser_ = nullptr; } - // Clean up buffered packet if it wasn't used (commented out to avoid - // potential issues) if (bufferedFirstPacket_) { - // av_packet_free(&bufferedFirstPacket_); - // } - returnNppStreamContextToCache(device_, std::move(nppCtx_)); } @@ -201,6 +280,19 @@ void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, [[maybe_unused]] const SharedAVCodecContext& codecContext) { + if (shouldFallbackToCPU(codecContext)) { + cpuFallback_ = createDeviceInterface(torch::kCPU); + TORCH_CHECK( + cpuFallback_ != nullptr, "Failed to create CPU device interface"); + cpuFallback_->initialize(avStream, avFormatCtx, codecContext); + cpuFallback_->initializeVideo( + VideoStreamOptions(), + {}, + /*resizedOutputDims=*/std::nullopt); + // We'll always use the CPU fallback from now on, so we can return early. + return; + } + TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; @@ -208,21 +300,15 @@ void BetaCudaDeviceInterface::initialize( const AVCodecParameters* codecPar = avStream->codecpar; TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null"); - // Initialize CPU interface for potential fallback - cpuInterface_ = createDeviceInterface(torch::kCPU); - TORCH_CHECK( - cpuInterface_ != nullptr, "Failed to create CPU device interface"); - cpuInterface_->initialize(avStream, avFormatCtx, codecContext); - cpuInterface_->initializeVideo( - VideoStreamOptions(), - {}, - /*resizedOutputDims=*/std::nullopt); - initializeBSF(codecPar, avFormatCtx); // Create parser. Default values that aren't obvious are taken from DALI. CUVIDPARSERPARAMS parserParams = {}; - parserParams.CodecType = validateCodecSupport(codecPar->codec_id); + auto codecType = validateCodecSupport(codecPar->codec_id); + TORCH_CHECK( + codecType.has_value(), + "This should never happen, we should be using the CPU fallback by now. Please report a bug."); + parserParams.CodecType = codecType.value(); parserParams.ulMaxNumDecodeSurfaces = 8; parserParams.ulMaxDisplayDelay = 0; // Callback setup, all are triggered by the parser within a call @@ -347,11 +433,6 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. 
- - if (!videoIsSupported(videoFormat)) { - usingCpuFallback_ = true; - return 0; - } decoder_ = createDecoder(videoFormat); } @@ -368,25 +449,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to // the NVCUVID parser. int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { - printf("usingCpuFallback_: %d\n", usingCpuFallback_); - if (usingCpuFallback_) { - return cpuInterface_->sendPacket(packet); + if (cpuFallback_) { + return cpuFallback_->sendPacket(packet); } TORCH_CHECK( packet.get() && packet->data && packet->size > 0, "sendPacket received an empty packet, this is unexpected, please report."); - // On first packet, store a copy before sending to CUDA parser - if (isFirstPacket_) { - // Make a deep copy of the packet before CUDA parser potentially corrupts it - bufferedFirstPacket_ = av_packet_alloc(); - TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); - int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); - TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); - isFirstPacket_ = false; - } - // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or // the original one if no BSF is needed. This new filtered packet must be // allocated outside of applyBSF: if it were allocated inside applyBSF, it @@ -402,27 +472,12 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { cuvidPacket.flags = CUVID_PKT_TIMESTAMP; cuvidPacket.timestamp = packetToSend->pts; - int result = sendCuvidPacket(cuvidPacket); - - // If capability check failed and we switched to CPU fallback, send buffered - // packet to CPU - if (usingCpuFallback_) { - printf("Falling back to CPU!!!! And re-sending packet\n"); - // // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); - // Create AutoAVPacket, then ReferenceAVPacket to access get() method - AutoAVPacket autoBufferedPacket; - ReferenceAVPacket refBufferedPacket(autoBufferedPacket); - // Copy the buffered packet data - av_packet_ref(refBufferedPacket.get(), bufferedFirstPacket_); - return cpuInterface_->sendPacket(refBufferedPacket); - } - - return result; + return sendCuvidPacket(cuvidPacket); } int BetaCudaDeviceInterface::sendEOFPacket() { - if (usingCpuFallback_) { - return cpuInterface_->sendEOFPacket(); + if (cpuFallback_) { + return cpuFallback_->sendEOFPacket(); } CUVIDSOURCEDATAPACKET cuvidPacket = {}; @@ -469,9 +524,6 @@ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF( // given frame. It means we can send that frame to be decoded by the hardware // NVDEC decoder by calling cuvidDecodePicture which is non-blocking. int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) { - if (usingCpuFallback_) { - return 1; // success - } TORCH_CHECK(picParams != nullptr, "Invalid picture parameters"); TORCH_CHECK(decoder_, "Decoder not initialized before picture decode"); // Send frame to be decoded by NVDEC - non-blocking call. @@ -489,8 +541,8 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder( // Moral equivalent of avcodec_receive_frame(). 
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { - if (usingCpuFallback_) { - return cpuInterface_->receiveFrame(avFrame); + if (cpuFallback_) { + return cpuFallback_->receiveFrame(avFrame); } if (readyFrames_.empty()) { @@ -627,8 +679,8 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( } void BetaCudaDeviceInterface::flush() { - if (usingCpuFallback_) { - cpuInterface_->flush(); + if (cpuFallback_) { + cpuFallback_->flush(); return; } @@ -649,10 +701,10 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { - if (usingCpuFallback_) { + if (cpuFallback_) { // CPU decoded frame - need to do CPU color conversion then transfer to GPU FrameOutput cpuFrameOutput; - cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); + cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); // Transfer CPU frame to GPU if (preAllocatedOutputTensor.has_value()) { diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 5b3f7523c..7424a877d 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -95,11 +95,7 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; - // CPU fallback support - std::unique_ptr cpuInterface_; - bool usingCpuFallback_ = false; - bool isFirstPacket_ = true; - AVPacket* bufferedFirstPacket_ = nullptr; + std::unique_ptr cpuFallback_; }; } // namespace facebook::torchcodec From 8373eec7bc740f391804897296d6d00d6accbadc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:20:14 +0100 Subject: [PATCH 11/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index d45ba6671..e36a47925 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -96,36 +96,6 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } -bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { - // Check decoder capabilities - same checks as DALI - auto caps = CUVIDDECODECAPS{}; - caps.eCodecType = videoFormat->codec; - caps.eChromaFormat = videoFormat->chroma_format; - caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; - CUresult result = cuvidGetDecoderCaps(&caps); - TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - - if (!caps.bIsSupported) { - return false; - } - - if (!(videoFormat->coded_width >= caps.nMinWidth && - videoFormat->coded_height >= caps.nMinHeight && - videoFormat->coded_width <= caps.nMaxWidth && - videoFormat->coded_height <= caps.nMaxHeight)) { - return false; - } - - constexpr unsigned int macroblockConstant = 256; - if (!(videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant <= - caps.nMaxMBCount)) { - return false; - } - - return true; -} - std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { if (!desc) { From 4178e2350832fe48820cabd75de0d43b8cf32e62 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:37:51 +0100 Subject: [PATCH 12/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 
deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index e36a47925..17e78abda 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -98,25 +98,18 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { - if (!desc) { - return std::nullopt; - } + TORCH_CHECK(desc != nullptr, "desc can't be null"); if (desc->nb_components == 1) { return cudaVideoChromaFormat_Monochrome; - } - - // Check if it's YUV (has chroma planes and not RGB) - if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + // Make sure it's YUV: has chroma planes and isn't RGB if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) { - // 4:4:4 (no subsampling) - return cudaVideoChromaFormat_444; + return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4 } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) { - // 4:2:0 (2x2 subsampling) - return cudaVideoChromaFormat_420; + return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0 } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) { - // 4:2:2 (2x1 subsampling) - return cudaVideoChromaFormat_422; + return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2 } } @@ -154,20 +147,20 @@ std::optional validateCodecSupport(AVCodecID codecId) { } } -bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { +bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { auto codecType = validateCodecSupport(codecContext->codec_id); if (!codecType.has_value()) { - return true; + return false; } const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt); if (!desc) { - return true; + return false; } auto chromaFormat = mapChromaFormat(desc); if (!chromaFormat.has_value()) { - return true; + return false; } auto caps = CUVIDDECODECAPS{}; @@ -177,11 +170,11 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { CUresult result = cuvidGetDecoderCaps(&caps); if (result != CUDA_SUCCESS) { - return true; + return false; } if (!caps.bIsSupported) { - return true; + return false; } if (!(static_cast(codecContext->coded_width) >= @@ -192,7 +185,7 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { caps.nMaxWidth && static_cast(codecContext->coded_height) <= caps.nMaxHeight)) { - return true; + return false; } // See nMaxMBCount in cuviddec.h @@ -201,16 +194,16 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { codecContext->coded_width * codecContext->coded_height) / macroblockConstant <= caps.nMaxMBCount)) { - return true; + return false; } // We explicitly request NV12 output format in createDecoder(), so we need to // make sure it's supported. 
if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - return true; + return false; } - return false; + return true; } } // namespace @@ -250,7 +243,7 @@ void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, [[maybe_unused]] const SharedAVCodecContext& codecContext) { - if (shouldFallbackToCPU(codecContext)) { + if (!nativeNVDECSupport(codecContext)) { cpuFallback_ = createDeviceInterface(torch::kCPU); TORCH_CHECK( cpuFallback_ != nullptr, "Failed to create CPU device interface"); @@ -403,7 +396,6 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - decoder_ = createDecoder(videoFormat); } From 234e1d99db810311bfdb375f450f92804d7fbd74 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 11:14:35 +0100 Subject: [PATCH 13/18] Fix merge --- src/torchcodec/_core/FFMPEGCommon.h | 6 ++++++ src/torchcodec/_core/SingleStreamDecoder.cpp | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 448333e20..337616ddc 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -73,6 +73,12 @@ using UniqueAVCodecContext = std::unique_ptr< Deleterp>; using SharedAVCodecContext = std::shared_ptr; +// create SharedAVCodecContext with custom deleter +inline SharedAVCodecContext makeSharedAVCodecContext(AVCodecContext* ctx) { + return SharedAVCodecContext( + ctx, Deleterp{}); +} + using UniqueAVFrame = std::unique_ptr>; using UniqueAVFilterGraph = std::unique_ptr< diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 0a7b89691..ba7382c67 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -440,8 +440,7 @@ void SingleStreamDecoder::addStream( AVCodecContext* codecContext = avcodec_alloc_context3(avCodec); TORCH_CHECK(codecContext != nullptr); - streamInfo.codecContext = SharedAVCodecContext( - codecContext, [](AVCodecContext* ctx) { avcodec_free_context(&ctx); }); + streamInfo.codecContext = makeSharedAVCodecContext(codecContext); int retVal = avcodec_parameters_to_context( streamInfo.codecContext.get(), streamInfo.stream->codecpar); From 82db4350748851f98e45f3b760972130362604b7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 11:41:18 +0100 Subject: [PATCH 14/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 33 ++++++++------ test/test_decoders.py | 43 +++++++++++-------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 17e78abda..747e5e82a 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -98,6 +98,8 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { + // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt + // otherwise. 
TORCH_CHECK(desc != nullptr, "desc can't be null"); if (desc->nb_components == 1) { @@ -117,6 +119,10 @@ std::optional mapChromaFormat( } std::optional validateCodecSupport(AVCodecID codecId) { + // Return the corresponding cudaVideoCodec if supported, std::nullopt + // otherwise + // Note that we currently return nullopt (and thus fallback to CPU) for some + // codecs that are technically supported by NVDEC, see comment below. switch (codecId) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; @@ -148,6 +154,8 @@ std::optional validateCodecSupport(AVCodecID codecId) { } bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { + // Return true iff the input video stream is supported by our NVDEC + // implementation. auto codecType = validateCodecSupport(codecContext->codec_id); if (!codecType.has_value()) { return false; @@ -177,28 +185,25 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { return false; } - if (!(static_cast(codecContext->coded_width) >= - caps.nMinWidth && - static_cast(codecContext->coded_height) >= - caps.nMinHeight && - static_cast(codecContext->coded_width) <= - caps.nMaxWidth && - static_cast(codecContext->coded_height) <= - caps.nMaxHeight)) { + auto coded_width = static_cast(codecContext->coded_width); + auto coded_height = static_cast(codecContext->coded_height); + if (!(coded_width >= static_cast(caps.nMinWidth) && + coded_height >= static_cast(caps.nMinHeight) && + coded_width <= caps.nMaxWidth && coded_height <= caps.nMaxHeight)) { return false; } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - if (!(static_cast( - codecContext->coded_width * codecContext->coded_height) / - macroblockConstant <= - caps.nMaxMBCount)) { + if (!(coded_width * coded_height / macroblockConstant <= caps.nMaxMBCount)) { return false; } - // We explicitly request NV12 output format in createDecoder(), so we need to - // make sure it's supported. + // We'll set the decoderParams.OutputFormat to NV12, so we need to make + // sure it's actually supported. + // TODO: If this fail, we could consider decoding to something else than NV12 + // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is + // what FFmpeg does. if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { return false; } diff --git a/test/test_decoders.py b/test/test_decoders.py index 873fe68a4..07bec9ac7 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1701,19 +1701,19 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): assert beta_frame.duration_seconds == ref_frame.duration_seconds @needs_cuda - def test_beta_cuda_interface_small_h265(self): - # Test to illustrate current difference in behavior between the BETA and - # the ffmpeg interface: this video isn't supported by NVDEC, but in the - # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. - - print() - a = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - # with pytest.raises( - # RuntimeError, - # match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", - # ): - b = VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) - torch.testing.assert_close(a.data, b.data, rtol=0, atol=0) + def test_beta_cuda_interface_cpu_fallback(self): + # Non-regression test for the CPU fallback behavior of the BETA CUDA + # interface. + # We know that the H265_VIDEO asset isn't supported by NVDEC, its + # dimensions are too small. 
We also know that the FFmpeg CUDA interface + # fallbacks to the CPU path in such cases. We assert that we fall back + # to the CPU path, too. + + ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + with set_cuda_backend("beta"): + beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + + torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): @@ -1739,20 +1739,25 @@ def test_set_cuda_backend(self): assert _get_cuda_backend() == "beta" def assert_decoder_uses(decoder, *, expected_backend): + # TODO: This doesn't work anymore after + # https://github.com/meta-pytorch/torchcodec/pull/977 + # We need to define a better way to assert which backend a decoder + # is using. + return # Assert that a decoder instance is using a given backend. # # We know H265_VIDEO fails on the BETA backend while it works on the # ffmpeg one. - if expected_backend == "ffmpeg": - decoder.get_frame_at(0) # this would fail if this was BETA - else: - with pytest.raises(RuntimeError, match="Video is too small"): - decoder.get_frame_at(0) + # if expected_backend == "ffmpeg": + # decoder.get_frame_at(0) # this would fail if this was BETA + # else: + # with pytest.raises(RuntimeError, match="Video is too small"): + # decoder.get_frame_at(0) # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - assert_decoder_uses(dec, expected_backend="ffmpeg") + # assert_decoder_uses(dec, expected_backend="ffmpeg") # Check the setting "beta" effectively uses the BETA backend. # We also show that the affects decoder creation only. When the decoder From f96264baf72c41e382f098a7a8676f6eb2d48bc4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:18:10 +0100 Subject: [PATCH 15/18] slightly simplify --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 747e5e82a..a2e907ace 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -187,15 +187,15 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { auto coded_width = static_cast(codecContext->coded_width); auto coded_height = static_cast(codecContext->coded_height); - if (!(coded_width >= static_cast(caps.nMinWidth) && - coded_height >= static_cast(caps.nMinHeight) && - coded_width <= caps.nMaxWidth && coded_height <= caps.nMaxHeight)) { + if (coded_width < static_cast(caps.nMinWidth) || + coded_height < static_cast(caps.nMinHeight) || + coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) { return false; } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - if (!(coded_width * coded_height / macroblockConstant <= caps.nMaxMBCount)) { + if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) { return false; } @@ -204,7 +204,9 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { // TODO: If this fail, we could consider decoding to something else than NV12 // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is // what FFmpeg does. 
- if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + bool supportsNV12Output = + (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1; + if (!supportsNV12Output) { return false; } From 68867488d051f1dc4718d1d819e464d21a61a09b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:20:45 +0100 Subject: [PATCH 16/18] nit --- test/test_decoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_decoders.py b/test/test_decoders.py index 07bec9ac7..098e4e969 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1757,7 +1757,7 @@ def assert_decoder_uses(decoder, *, expected_backend): # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - # assert_decoder_uses(dec, expected_backend="ffmpeg") + assert_decoder_uses(dec, expected_backend="ffmpeg") # Check the setting "beta" effectively uses the BETA backend. # We also show that the affects decoder creation only. When the decoder From 982979b37322703575046dfed69b821dc80a6e20 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:21:10 +0100 Subject: [PATCH 17/18] Nit --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a2e907ace..a5c5e6472 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI - CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; decoderParams.ChromaFormat = videoFormat->chroma_format; From 0c6e98607ed6609b907e1c50824c7b3b5a5223e2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:24:04 +0100 Subject: [PATCH 18/18] consistent names --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a5c5e6472..7124e4309 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -95,7 +95,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } -std::optional mapChromaFormat( +std::optional validateChromaSupport( const AVPixFmtDescriptor* desc) { // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt // otherwise. @@ -165,7 +165,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { return false; } - auto chromaFormat = mapChromaFormat(desc); + auto chromaFormat = validateChromaSupport(desc); if (!chromaFormat.has_value()) { return false; }
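
The chroma classification used by validateChromaSupport() can be exercised against libavutil directly. The sketch below is illustrative only and is not part of the patches above: it reimplements the same log2_chroma_w / log2_chroma_h branch on a descriptor obtained from av_pix_fmt_desc_get(), and assumes a typical FFmpeg dev setup (compile and link with -lavutil).

extern "C" {
#include <libavutil/pixdesc.h>
}
#include <cstdio>

// Classify chroma subsampling the same way the patch does: log2_chroma_w and
// log2_chroma_h describe how much the chroma planes are downscaled.
static const char* classifyChroma(const AVPixFmtDescriptor* desc) {
  if (desc->nb_components == 1) {
    return "monochrome";
  }
  if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
    if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
      return "4:4:4"; // 1x1 subsampling
    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
      return "4:2:0"; // 2x2 subsampling
    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
      return "4:2:2"; // 2x1 subsampling
    }
  }
  return "unsupported";
}

int main() {
  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUV420P);
  if (desc == nullptr) {
    return 1;
  }
  // yuv420p has 2x2 chroma subsampling, so this prints "yuv420p -> 4:2:0".
  std::printf("%s -> %s\n", desc->name, classifyChroma(desc));
  return 0;
}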
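
For intuition on the two arithmetic checks in nativeNVDECSupport(), here is a standalone sketch with made-up capability values (a real query would come from cuvidGetDecoderCaps(); the numbers below are hypothetical). nMaxMBCount is expressed in 16x16 macroblocks, hence the divide by 256, and the NV12 check tests the bit of nOutputFormatMask indexed by cudaVideoSurfaceFormat_NV12, which is 0 in cuviddec.h.

#include <cstdio>

int main() {
  // Hypothetical stream dimensions and decoder capabilities.
  unsigned int codedWidth = 3840;
  unsigned int codedHeight = 2160;
  unsigned int nMaxMBCount = 65536;      // max surface size, in 16x16 macroblocks
  unsigned int nOutputFormatMask = 0x3;  // bit i set => surface format i supported

  // 3840 * 2160 / 256 = 32400 macroblocks, well under the 65536 cap.
  unsigned int macroblocks = codedWidth * codedHeight / 256;
  bool fitsMBCount = macroblocks <= nMaxMBCount;

  // cudaVideoSurfaceFormat_NV12 == 0, so this tests bit 0 of the mask.
  bool supportsNV12 = (nOutputFormatMask >> 0) & 1;

  std::printf("macroblocks=%u fits=%d nv12=%d -> %s\n", macroblocks,
              fitsMBCount, supportsNV12,
              (fitsMBCount && supportsNV12) ? "use NVDEC" : "CPU fallback");
  return 0;
}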
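
The makeSharedAVCodecContext() helper introduced in PATCH 13 boils down to a shared_ptr whose deleter calls avcodec_free_context, so the codec context can be co-owned by the decoder's stream state and the device interface. Below is a minimal sketch of the same idea using a lambda deleter instead of the repo's Deleterp wrapper; it is illustrative only and assumes linking against libavcodec.

extern "C" {
#include <libavcodec/avcodec.h>
}
#include <memory>

int main() {
  const AVCodec* codec = avcodec_find_decoder(AV_CODEC_ID_H264);
  if (codec == nullptr) {
    return 1;
  }
  AVCodecContext* raw = avcodec_alloc_context3(codec);
  if (raw == nullptr) {
    return 1;
  }
  // Shared ownership: the context is freed exactly once, when the last owner
  // (e.g. decoder state or device interface) releases its reference.
  std::shared_ptr<AVCodecContext> ctx(
      raw, [](AVCodecContext* c) { avcodec_free_context(&c); });
  return 0;
}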