From 718c268fc6d0a1c158aea6f666f3baa4240b6d54 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 10 Oct 2025 14:51:34 -0700 Subject: [PATCH 01/18] Refactor receiveFrame and sendPacket logic to dispatch directly to interface --- .../_core/BetaCudaDeviceInterface.h | 3 -- src/torchcodec/_core/DeviceInterface.h | 50 ++++++++++--------- src/torchcodec/_core/SingleStreamDecoder.cpp | 34 ++++--------- 3 files changed, 37 insertions(+), 50 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 0bf9951d6..8206a7b1b 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,9 +48,6 @@ class BetaCudaDeviceInterface : public DeviceInterface { std::optional preAllocatedOutputTensor = std::nullopt) override; - bool canDecodePacketDirectly() const override { - return true; - } int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index cac29e838..967484618 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -80,42 +80,45 @@ class DeviceInterface { // Extension points for custom decoding paths // ------------------------------------------ - // Override to return true if this device interface can decode packets - // directly. This means that the following two member functions can both - // be called: - // - // 1. sendPacket() - // 2. receiveFrame() - virtual bool canDecodePacketDirectly() const { - return false; + // Set the codec context for default FFmpeg decoding operations + // This must be called during initialization before using + // sendPacket/receiveFrame + virtual void setCodecContext(AVCodecContext* codecContext) { + codecContext_ = codecContext; } - // Moral equivalent of avcodec_send_packet() // Returns AVSUCCESS on success, AVERROR(EAGAIN) if decoder queue full, or // other AVERROR on failure - virtual int sendPacket([[maybe_unused]] ReferenceAVPacket& avPacket) { - TORCH_CHECK( - false, - "Send/receive packet decoding not implemented for this device interface"); - return AVERROR(ENOSYS); + // Default implementation uses FFmpeg directly + virtual int sendPacket(ReferenceAVPacket& avPacket) { + if (!codecContext_) { + TORCH_CHECK( + false, "Codec context not available for default packet sending"); + return AVERROR(EINVAL); + } + return avcodec_send_packet(codecContext_, avPacket.get()); } // Send an EOF packet to flush the decoder // Returns AVSUCCESS on success, or other AVERROR on failure + // Default implementation uses FFmpeg directly virtual int sendEOFPacket() { - TORCH_CHECK( - false, "Send EOF packet not implemented for this device interface"); - return AVERROR(ENOSYS); + if (!codecContext_) { + TORCH_CHECK(false, "Codec context not available for EOF packet sending"); + return AVERROR(EINVAL); + } + return avcodec_send_packet(codecContext_, nullptr); } - // Moral equivalent of avcodec_receive_frame() // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready, // AVERROR_EOF if end of stream, or other AVERROR on failure - virtual int receiveFrame([[maybe_unused]] UniqueAVFrame& avFrame) { - TORCH_CHECK( - false, - "Send/receive packet decoding not implemented for this device interface"); - return AVERROR(ENOSYS); + // Default implementation uses FFmpeg directly + virtual int receiveFrame(UniqueAVFrame& avFrame) { + if (!codecContext_) { + TORCH_CHECK(false, "Codec context not 
available for frame receiving"); + return AVERROR(EINVAL); + } + return avcodec_receive_frame(codecContext_, avFrame.get()); } // Flush remaining frames from decoder @@ -126,6 +129,7 @@ class DeviceInterface { protected: torch::Device device_; + AVCodecContext* codecContext_ = nullptr; // Non-owning pointer }; using CreateDeviceInterfaceFn = diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index d06c47922..dfbe72d6f 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -459,6 +459,10 @@ void SingleStreamDecoder::addStream( codecContext->time_base = streamInfo.stream->time_base; + // Set the codec context on the device interface for default FFmpeg + // implementations + deviceInterface_->setCodecContext(codecContext); + containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName = std::string(avcodec_get_name(codecContext->codec_id)); @@ -1169,24 +1173,16 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( cursorWasJustSet_ = false; } - StreamInfo& streamInfo = streamInfos_[activeStreamIndex_]; UniqueAVFrame avFrame(av_frame_alloc()); AutoAVPacket autoAVPacket; int status = AVSUCCESS; bool reachedEOF = false; - // TODONVDEC P2: Instead of calling canDecodePacketDirectly() and rely on - // if/else blocks to dispatch to the interface or to FFmpeg, consider *always* - // dispatching to the interface. The default implementation of the interface's - // receiveFrame and sendPacket could just be calling avcodec_receive_frame and - // avcodec_send_packet. This would make the decoding loop even more generic. + // The default implementation uses avcodec_receive_frame and + // avcodec_send_packet, while specialized interfaces can override for + // hardware-specific optimizations. while (true) { - if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->receiveFrame(avFrame); - } else { - status = - avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get()); - } + status = deviceInterface_->receiveFrame(avFrame); if (status != AVSUCCESS && status != AVERROR(EAGAIN)) { // Non-retriable error @@ -1222,13 +1218,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( if (status == AVERROR_EOF) { // End of file reached. We must drain the decoder - if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->sendEOFPacket(); - } else { - status = avcodec_send_packet( - streamInfo.codecContext.get(), - /*avpkt=*/nullptr); - } + status = deviceInterface_->sendEOFPacket(); TORCH_CHECK( status >= AVSUCCESS, "Could not flush decoder: ", @@ -1253,11 +1243,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( // We got a valid packet. Send it to the decoder, and we'll receive it in // the next iteration. 
- if (deviceInterface_->canDecodePacketDirectly()) { - status = deviceInterface_->sendPacket(packet); - } else { - status = avcodec_send_packet(streamInfo.codecContext.get(), packet.get()); - } + status = deviceInterface_->sendPacket(packet); TORCH_CHECK( status >= AVSUCCESS, "Could not push packet to decoder: ", From 30ee0e88b1dbe75c1e3d7d9f61a3370627b1a947 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 10 Oct 2025 15:04:10 -0700 Subject: [PATCH 02/18] ran precommit --- src/torchcodec/_core/BetaCudaDeviceInterface.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 8206a7b1b..c59e48140 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,7 +48,6 @@ class BetaCudaDeviceInterface : public DeviceInterface { std::optional preAllocatedOutputTensor = std::nullopt) override; - int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; int receiveFrame(UniqueAVFrame& avFrame) override; From 11d7adfb62543ffc799e3acfb416aee164098fd8 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 15 Oct 2025 08:13:54 -0700 Subject: [PATCH 03/18] use shared_ptr for codecContext --- .../_core/BetaCudaDeviceInterface.cpp | 4 +- .../_core/BetaCudaDeviceInterface.h | 3 +- src/torchcodec/_core/CpuDeviceInterface.cpp | 4 +- src/torchcodec/_core/CpuDeviceInterface.h | 3 +- src/torchcodec/_core/CudaDeviceInterface.cpp | 6 ++- src/torchcodec/_core/CudaDeviceInterface.h | 3 +- src/torchcodec/_core/DeviceInterface.h | 39 ++++++++----------- src/torchcodec/_core/FFMPEGCommon.cpp | 9 +++++ src/torchcodec/_core/FFMPEGCommon.h | 3 ++ src/torchcodec/_core/SingleStreamDecoder.cpp | 19 +++++---- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- 11 files changed, 54 insertions(+), 41 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 78fa8d635..3cc449d09 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -231,8 +231,10 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { void BetaCudaDeviceInterface::initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) { + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index c59e48140..fb01415d4 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -40,7 +40,8 @@ class BetaCudaDeviceInterface : public DeviceInterface { void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index e6b96e3e4..0e9b46434 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -48,8 +48,10 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device) void 
CpuDeviceInterface::initialize( const AVStream* avStream, - [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) { + [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "avStream is null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; } diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 399b0c6be..9f44c4e8c 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ b/src/torchcodec/_core/CpuDeviceInterface.h @@ -25,7 +25,8 @@ class CpuDeviceInterface : public DeviceInterface { virtual void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; virtual void initializeVideo( const VideoStreamOptions& videoStreamOptions, diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index aea2b2d9a..ba8e495b8 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -117,15 +117,17 @@ CudaDeviceInterface::~CudaDeviceInterface() { void CudaDeviceInterface::initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) { + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "avStream is null"); + codecContext_ = codecContext; timeBase_ = avStream->time_base; // TODO: Ideally, we should keep all interface implementations independent. cpuInterface_ = createDeviceInterface(torch::kCPU); TORCH_CHECK( cpuInterface_ != nullptr, "Failed to create CPU device interface"); - cpuInterface_->initialize(avStream, avFormatCtx); + cpuInterface_->initialize(avStream, avFormatCtx, codecContext); cpuInterface_->initializeVideo( VideoStreamOptions(), {}, diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index 1a8f184ec..d240066f4 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -22,7 +22,8 @@ class CudaDeviceInterface : public DeviceInterface { void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) override; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) override; void initializeVideo( const VideoStreamOptions& videoStreamOptions, diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index 967484618..b6c88438e 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -54,7 +54,8 @@ class DeviceInterface { // Initialize the device with parameters generic to all kinds of decoding. virtual void initialize( const AVStream* avStream, - const UniqueDecodingAVFormatContext& avFormatCtx) = 0; + const UniqueDecodingAVFormatContext& avFormatCtx, + const SharedAVCodecContext& codecContext) = 0; // Initialize the device with parameters specific to video decoding. There is // a default empty implementation. 
@@ -80,23 +81,14 @@ class DeviceInterface { // Extension points for custom decoding paths // ------------------------------------------ - // Set the codec context for default FFmpeg decoding operations - // This must be called during initialization before using - // sendPacket/receiveFrame - virtual void setCodecContext(AVCodecContext* codecContext) { - codecContext_ = codecContext; - } - // Returns AVSUCCESS on success, AVERROR(EAGAIN) if decoder queue full, or // other AVERROR on failure // Default implementation uses FFmpeg directly virtual int sendPacket(ReferenceAVPacket& avPacket) { - if (!codecContext_) { - TORCH_CHECK( - false, "Codec context not available for default packet sending"); - return AVERROR(EINVAL); - } - return avcodec_send_packet(codecContext_, avPacket.get()); + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default packet sending"); + return avcodec_send_packet(codecContext_.get(), avPacket.get()); } // Send an EOF packet to flush the decoder @@ -107,29 +99,30 @@ class DeviceInterface { TORCH_CHECK(false, "Codec context not available for EOF packet sending"); return AVERROR(EINVAL); } - return avcodec_send_packet(codecContext_, nullptr); + return avcodec_send_packet(codecContext_.get(), nullptr); } // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready, // AVERROR_EOF if end of stream, or other AVERROR on failure // Default implementation uses FFmpeg directly virtual int receiveFrame(UniqueAVFrame& avFrame) { - if (!codecContext_) { - TORCH_CHECK(false, "Codec context not available for frame receiving"); - return AVERROR(EINVAL); - } - return avcodec_receive_frame(codecContext_, avFrame.get()); + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default frame receiving"); + return avcodec_receive_frame(codecContext_.get(), avFrame.get()); } // Flush remaining frames from decoder virtual void flush() { - // Default implementation is no-op for standard decoders - // Custom decoders can override this method + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default flushing"); + avcodec_flush_buffers(codecContext_.get()); } protected: torch::Device device_; - AVCodecContext* codecContext_ = nullptr; // Non-owning pointer + SharedAVCodecContext codecContext_; }; using CreateDeviceInterfaceFn = diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 0570f06cf..40f918fe6 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -158,6 +158,15 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) { #endif } +int getNumChannels(const SharedAVCodecContext& avCodecContext) { +#if LIBAVFILTER_VERSION_MAJOR > 8 || \ + (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) + return avCodecContext->ch_layout.nb_channels; +#else + return avCodecContext->channels; +#endif +} + void setDefaultChannelLayout( UniqueAVCodecContext& avCodecContext, int numChannels) { diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 19cddcc37..5ca1e732c 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -71,6 +71,8 @@ using UniqueEncodingAVFormatContext = std::unique_ptr< using UniqueAVCodecContext = std::unique_ptr< AVCodecContext, Deleterp>; +using SharedAVCodecContext = std::shared_ptr; + using UniqueAVFrame = std::unique_ptr>; using UniqueAVFilterGraph = std::unique_ptr< @@ -172,6 +174,7 @@ const AVPixelFormat* 
getSupportedPixelFormats(const AVCodec& avCodec); int getNumChannels(const UniqueAVFrame& avFrame); int getNumChannels(const UniqueAVCodecContext& avCodecContext); +int getNumChannels(const SharedAVCodecContext& avCodecContext); void setDefaultChannelLayout( UniqueAVCodecContext& avCodecContext, diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index dfbe72d6f..0a7b89691 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -429,7 +429,6 @@ void SingleStreamDecoder::addStream( TORCH_CHECK( deviceInterface_ != nullptr, "Failed to create device interface. This should never happen, please report."); - deviceInterface_->initialize(streamInfo.stream, formatContext_); // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within // addStream() which is supposed to be generic @@ -441,7 +440,8 @@ void SingleStreamDecoder::addStream( AVCodecContext* codecContext = avcodec_alloc_context3(avCodec); TORCH_CHECK(codecContext != nullptr); - streamInfo.codecContext.reset(codecContext); + streamInfo.codecContext = SharedAVCodecContext( + codecContext, [](AVCodecContext* ctx) { avcodec_free_context(&ctx); }); int retVal = avcodec_parameters_to_context( streamInfo.codecContext.get(), streamInfo.stream->codecpar); @@ -453,18 +453,19 @@ void SingleStreamDecoder::addStream( // Note that we must make sure to register the harware device context // with the codec context before calling avcodec_open2(). Otherwise, decoding // will happen on the CPU and not the hardware device. - deviceInterface_->registerHardwareDeviceWithCodec(codecContext); + deviceInterface_->registerHardwareDeviceWithCodec( + streamInfo.codecContext.get()); retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr); TORCH_CHECK(retVal >= AVSUCCESS, getFFMPEGErrorStringFromErrorCode(retVal)); - codecContext->time_base = streamInfo.stream->time_base; + streamInfo.codecContext->time_base = streamInfo.stream->time_base; - // Set the codec context on the device interface for default FFmpeg - // implementations - deviceInterface_->setCodecContext(codecContext); + // Initialize the device interface with the codec context + deviceInterface_->initialize( + streamInfo.stream, formatContext_, streamInfo.codecContext); containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName = - std::string(avcodec_get_name(codecContext->codec_id)); + std::string(avcodec_get_name(streamInfo.codecContext->codec_id)); // We will only need packets from the active stream, so we tell FFmpeg to // discard packets from the other streams. Note that av_read_frame() may still @@ -1153,8 +1154,6 @@ void SingleStreamDecoder::maybeSeekToBeforeDesiredPts() { getFFMPEGErrorStringFromErrorCode(status)); decodeStats_.numFlushes++; - avcodec_flush_buffers(streamInfo.codecContext.get()); - deviceInterface_->flush(); } diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 48821ff09..10f820550 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -221,7 +221,7 @@ class SingleStreamDecoder { AVMediaType avMediaType = AVMEDIA_TYPE_UNKNOWN; AVRational timeBase = {}; - UniqueAVCodecContext codecContext; + SharedAVCodecContext codecContext; // The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was // called. 
From 44edb930df6d083893469a681615e8fc9e1a8c93 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 15 Oct 2025 12:18:07 -0700 Subject: [PATCH 04/18] address feedback --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 3 +-- src/torchcodec/_core/DeviceInterface.h | 7 +++---- src/torchcodec/_core/FFMPEGCommon.cpp | 9 --------- src/torchcodec/_core/FFMPEGCommon.h | 1 - 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 3cc449d09..0bdd91a23 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -232,9 +232,8 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, - const SharedAVCodecContext& codecContext) { + [[maybe_unused]] const SharedAVCodecContext& codecContext) { TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); - codecContext_ = codecContext; timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index b6c88438e..25a36a40f 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -95,10 +95,9 @@ class DeviceInterface { // Returns AVSUCCESS on success, or other AVERROR on failure // Default implementation uses FFmpeg directly virtual int sendEOFPacket() { - if (!codecContext_) { - TORCH_CHECK(false, "Codec context not available for EOF packet sending"); - return AVERROR(EINVAL); - } + TORCH_CHECK( + codecContext_ != nullptr, + "Codec context not available for default EOF packet sending"); return avcodec_send_packet(codecContext_.get(), nullptr); } diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 40f918fe6..97ff082e1 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -149,15 +149,6 @@ int getNumChannels(const UniqueAVFrame& avFrame) { #endif } -int getNumChannels(const UniqueAVCodecContext& avCodecContext) { -#if LIBAVFILTER_VERSION_MAJOR > 8 || \ - (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) - return avCodecContext->ch_layout.nb_channels; -#else - return avCodecContext->channels; -#endif -} - int getNumChannels(const SharedAVCodecContext& avCodecContext) { #if LIBAVFILTER_VERSION_MAJOR > 8 || \ (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44) diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 5ca1e732c..448333e20 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -173,7 +173,6 @@ const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec); const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec); int getNumChannels(const UniqueAVFrame& avFrame); -int getNumChannels(const UniqueAVCodecContext& avCodecContext); int getNumChannels(const SharedAVCodecContext& avCodecContext); void setDefaultChannelLayout( From b09b203eb8bb45c295432252f8d35e78a2926fbd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:25:24 +0100 Subject: [PATCH 05/18] Disgustingly hacky POC --- .../_core/BetaCudaDeviceInterface.cpp | 214 +++++++++++++----- .../_core/BetaCudaDeviceInterface.h | 6 + test/test_decoders.py | 13 +- 3 files changed, 172 insertions(+), 61 deletions(-) diff --git 
a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index d55bb1137..6fd5a6ee3 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -52,7 +52,9 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { return decoder->frameReadyInDisplayOrder(dispInfo); } -static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { +static UniqueCUvideodecoder createDecoder( + CUVIDEOFORMAT* videoFormat, + bool* capabilityCheckFailed = nullptr) { // Check decoder capabilities - same checks as DALI auto caps = CUVIDDECODECAPS{}; caps.eCodecType = videoFormat->codec; @@ -61,65 +63,84 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { CUresult result = cuvidGetDecoderCaps(&caps); TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - TORCH_CHECK( - caps.bIsSupported, - "Codec configuration not supported on this GPU. " - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); - - TORCH_CHECK( - videoFormat->coded_width >= caps.nMinWidth && - videoFormat->coded_height >= caps.nMinHeight, - "Video is too small in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMinWidth, - "x", - caps.nMinHeight); + if (!caps.bIsSupported) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Codec configuration not supported on this GPU. " + "Codec: ", + static_cast(videoFormat->codec), + ", chroma format: ", + static_cast(videoFormat->chroma_format), + ", bit depth: ", + videoFormat->bit_depth_luma_minus8 + 8); + } - TORCH_CHECK( - videoFormat->coded_width <= caps.nMaxWidth && - videoFormat->coded_height <= caps.nMaxHeight, - "Video is too large in at least one dimension. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported:", - caps.nMaxWidth, - "x", - caps.nMaxHeight); + if (videoFormat->coded_width < caps.nMinWidth || + videoFormat->coded_height < caps.nMinHeight || + videoFormat->coded_width > caps.nMaxWidth || + videoFormat->coded_height > caps.nMaxHeight) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Video dimensions not supported. Provided: ", + videoFormat->coded_width, + "x", + videoFormat->coded_height, + " vs supported: ", + caps.nMinWidth, + "x", + caps.nMinHeight, + " to ", + caps.nMaxWidth, + "x", + caps.nMaxHeight); + } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - TORCH_CHECK( - videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant <= - caps.nMaxMBCount, - "Video is too large (too many macroblocks). " - "Provided (width * height / ", - macroblockConstant, - "): ", - videoFormat->coded_width * videoFormat->coded_height / macroblockConstant, - " vs supported:", - caps.nMaxMBCount); + if (videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant > + caps.nMaxMBCount) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "Video is too large (too many macroblocks). 
" + "Provided (width * height / ", + macroblockConstant, + "): ", + videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant, + " vs supported:", + caps.nMaxMBCount); + } // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make // sure it's actually supported. - TORCH_CHECK( - (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1, - "NV12 output format is not supported for this configuration. ", - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + if (capabilityCheckFailed) { + *capabilityCheckFailed = true; + return nullptr; + } + TORCH_CHECK( + false, + "NV12 output format is not supported for this configuration. ", + "Codec: ", + static_cast(videoFormat->codec), + ", chroma format: ", + static_cast(videoFormat->chroma_format), + ", bit depth: ", + videoFormat->bit_depth_luma_minus8 + 8); + } // Decoder creation parameters, most are taken from DALI CUVIDDECODECREATEINFO decoderParams = {}; @@ -225,6 +246,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { videoParser_ = nullptr; } + // Clean up buffered packet if it wasn't used (commented out to avoid + // potential issues) if (bufferedFirstPacket_) { + // av_packet_free(&bufferedFirstPacket_); + // } + returnNppStreamContextToCache(device_, std::move(nppCtx_)); } @@ -239,6 +265,16 @@ void BetaCudaDeviceInterface::initialize( const AVCodecParameters* codecPar = avStream->codecpar; TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null"); + // Initialize CPU interface for potential fallback + cpuInterface_ = createDeviceInterface(torch::kCPU); + TORCH_CHECK( + cpuInterface_ != nullptr, "Failed to create CPU device interface"); + cpuInterface_->initialize(avStream, avFormatCtx, codecContext); + cpuInterface_->initializeVideo( + VideoStreamOptions(), + {}, + /*resizedOutputDims=*/std::nullopt); + initializeBSF(codecPar, avFormatCtx); // Create parser. Default values that aren't obvious are taken from DALI. @@ -368,7 +404,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - decoder_ = createDecoder(videoFormat); + bool capabilityCheckFailed = false; + decoder_ = createDecoder(videoFormat, &capabilityCheckFailed); + + if (capabilityCheckFailed) { + usingCpuFallback_ = true; + capabilityCheckPending_ = false; + return static_cast(videoFormat_.min_num_decode_surfaces); + } } TORCH_CHECK(decoder_, "Failed to get or create decoder"); @@ -383,10 +426,25 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to // the NVCUVID parser. 
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { + printf("usingCpuFallback_: %d\n", usingCpuFallback_); + if (usingCpuFallback_) { + return cpuInterface_->sendPacket(packet); + } + TORCH_CHECK( packet.get() && packet->data && packet->size > 0, "sendPacket received an empty packet, this is unexpected, please report."); + // On first packet, store a copy before sending to CUDA parser + if (capabilityCheckPending_) { + // Make a deep copy of the packet before CUDA parser potentially corrupts it + bufferedFirstPacket_ = av_packet_alloc(); + TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); + int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); + TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); + capabilityCheckPending_ = false; + } + // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or // the original one if no BSF is needed. This new filtered packet must be // allocated outside of applyBSF: if it were allocated inside applyBSF, it @@ -402,10 +460,29 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { cuvidPacket.flags = CUVID_PKT_TIMESTAMP; cuvidPacket.timestamp = packetToSend->pts; - return sendCuvidPacket(cuvidPacket); + int result = sendCuvidPacket(cuvidPacket); + + // If capability check failed and we switched to CPU fallback, send buffered + // packet to CPU + if (usingCpuFallback_) { + printf("Falling back to CPU!!!! And re-sending packet\n"); + TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // Create AutoAVPacket, then ReferenceAVPacket to access get() method + AutoAVPacket autoBufferedPacket; + ReferenceAVPacket refBufferedPacket(autoBufferedPacket); + // Copy the buffered packet data + av_packet_ref(refBufferedPacket.get(), bufferedFirstPacket_); + return cpuInterface_->sendPacket(refBufferedPacket); + } + + return result; } int BetaCudaDeviceInterface::sendEOFPacket() { + if (usingCpuFallback_) { + return cpuInterface_->sendEOFPacket(); + } + CUVIDSOURCEDATAPACKET cuvidPacket = {}; cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM; eofSent_ = true; @@ -450,6 +527,9 @@ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF( // given frame. It means we can send that frame to be decoded by the hardware // NVDEC decoder by calling cuvidDecodePicture which is non-blocking. int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) { + if (usingCpuFallback_) { + return 1; // success + } TORCH_CHECK(picParams != nullptr, "Invalid picture parameters"); TORCH_CHECK(decoder_, "Decoder not initialized before picture decode"); // Send frame to be decoded by NVDEC - non-blocking call. @@ -467,6 +547,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder( // Moral equivalent of avcodec_receive_frame(). int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { + if (usingCpuFallback_) { + return cpuInterface_->receiveFrame(avFrame); + } + if (readyFrames_.empty()) { // No frame found, instruct caller to try again later after sending more // packets, or to stop if EOF was already sent. @@ -601,6 +685,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( } void BetaCudaDeviceInterface::flush() { + if (usingCpuFallback_) { + cpuInterface_->flush(); + return; + } + // The NVCUVID docs mention that after seeking, i.e. when flush() is called, // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. 
The docs // don't say whether this should be an empty packet, or whether it should be a @@ -618,6 +707,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { + if (usingCpuFallback_) { + // CPU decoded frame - need to do CPU color conversion then transfer to GPU + FrameOutput cpuFrameOutput; + cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); + + // Transfer CPU frame to GPU + if (preAllocatedOutputTensor.has_value()) { + preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); + frameOutput.data = preAllocatedOutputTensor.value(); + } else { + frameOutput.data = cpuFrameOutput.data.to(device_); + } + return; + } + // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). TORCH_CHECK( diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index fb01415d4..b03814268 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -94,6 +94,12 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; + + // CPU fallback support + std::unique_ptr cpuInterface_; + bool usingCpuFallback_ = false; + bool capabilityCheckPending_ = true; + AVPacket* bufferedFirstPacket_ = nullptr; }; } // namespace facebook::torchcodec diff --git a/test/test_decoders.py b/test/test_decoders.py index f9c7d2ff6..52d329e21 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1695,12 +1695,13 @@ def test_beta_cuda_interface_small_h265(self): # the ffmpeg interface: this video isn't supported by NVDEC, but in the # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. - VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - with pytest.raises( - RuntimeError, - match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", - ): - VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + print() + # VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + # with pytest.raises( + # RuntimeError, + # match="Video is too small in at least one dimension. 
Provided: 128x128 vs supported:144x144", + # ): + VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) @needs_cuda def test_beta_cuda_interface_error(self): From 82f5807a02b01ab335c73468b433be578f7d78d6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:48:01 +0100 Subject: [PATCH 06/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 70 +++---------------- test/test_decoders.py | 5 +- 2 files changed, 13 insertions(+), 62 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 6fd5a6ee3..824809ff4 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,7 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder( CUVIDEOFORMAT* videoFormat, - bool* capabilityCheckFailed = nullptr) { + bool* capabilityCheckFailed) { // Check decoder capabilities - same checks as DALI auto caps = CUVIDDECODECAPS{}; caps.eCodecType = videoFormat->codec; @@ -64,43 +64,16 @@ static UniqueCUvideodecoder createDecoder( TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); if (!caps.bIsSupported) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Codec configuration not supported on this GPU. " - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + *capabilityCheckFailed = true; + return nullptr; } if (videoFormat->coded_width < caps.nMinWidth || videoFormat->coded_height < caps.nMinHeight || videoFormat->coded_width > caps.nMaxWidth || videoFormat->coded_height > caps.nMaxHeight) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Video dimensions not supported. Provided: ", - videoFormat->coded_width, - "x", - videoFormat->coded_height, - " vs supported: ", - caps.nMinWidth, - "x", - caps.nMinHeight, - " to ", - caps.nMaxWidth, - "x", - caps.nMaxHeight); + *capabilityCheckFailed = true; + return nullptr; } // See nMaxMBCount in cuviddec.h @@ -108,38 +81,15 @@ static UniqueCUvideodecoder createDecoder( if (videoFormat->coded_width * videoFormat->coded_height / macroblockConstant > caps.nMaxMBCount) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "Video is too large (too many macroblocks). " - "Provided (width * height / ", - macroblockConstant, - "): ", - videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant, - " vs supported:", - caps.nMaxMBCount); + *capabilityCheckFailed = true; + return nullptr; } // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make // sure it's actually supported. if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - if (capabilityCheckFailed) { - *capabilityCheckFailed = true; - return nullptr; - } - TORCH_CHECK( - false, - "NV12 output format is not supported for this configuration. 
", - "Codec: ", - static_cast(videoFormat->codec), - ", chroma format: ", - static_cast(videoFormat->chroma_format), - ", bit depth: ", - videoFormat->bit_depth_luma_minus8 + 8); + *capabilityCheckFailed = true; + return nullptr; } // Decoder creation parameters, most are taken from DALI @@ -466,7 +416,7 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { // packet to CPU if (usingCpuFallback_) { printf("Falling back to CPU!!!! And re-sending packet\n"); - TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); // Create AutoAVPacket, then ReferenceAVPacket to access get() method AutoAVPacket autoBufferedPacket; ReferenceAVPacket refBufferedPacket(autoBufferedPacket); diff --git a/test/test_decoders.py b/test/test_decoders.py index 52d329e21..10e13b0cc 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1696,12 +1696,13 @@ def test_beta_cuda_interface_small_h265(self): # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. print() - # VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + a = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) # with pytest.raises( # RuntimeError, # match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", # ): - VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + b = VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) + torch.testing.assert_close(a.data, b.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): From 936de765d3c1ac35e5b903dad6956d6c0821f635 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 14:49:05 +0100 Subject: [PATCH 07/18] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 824809ff4..b4e3529b2 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -360,7 +360,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { if (capabilityCheckFailed) { usingCpuFallback_ = true; capabilityCheckPending_ = false; - return static_cast(videoFormat_.min_num_decode_surfaces); + return 0; } } From c5939d2d2214052a9b6a9a7aee31b64b6fbd50f3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 17:22:22 +0100 Subject: [PATCH 08/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 83 +++++++++---------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index b4e3529b2..b4cf4de1f 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -52,46 +52,7 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { return decoder->frameReadyInDisplayOrder(dispInfo); } -static UniqueCUvideodecoder createDecoder( - CUVIDEOFORMAT* videoFormat, - bool* capabilityCheckFailed) { - // Check decoder capabilities - same checks as DALI - auto caps = CUVIDDECODECAPS{}; - caps.eCodecType = videoFormat->codec; - caps.eChromaFormat = videoFormat->chroma_format; - caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; - CUresult result = cuvidGetDecoderCaps(&caps); - TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - - if 
(!caps.bIsSupported) { - *capabilityCheckFailed = true; - return nullptr; - } - - if (videoFormat->coded_width < caps.nMinWidth || - videoFormat->coded_height < caps.nMinHeight || - videoFormat->coded_width > caps.nMaxWidth || - videoFormat->coded_height > caps.nMaxHeight) { - *capabilityCheckFailed = true; - return nullptr; - } - - // See nMaxMBCount in cuviddec.h - constexpr unsigned int macroblockConstant = 256; - if (videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant > - caps.nMaxMBCount) { - *capabilityCheckFailed = true; - return nullptr; - } - - // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make - // sure it's actually supported. - if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - *capabilityCheckFailed = true; - return nullptr; - } - +static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; @@ -128,12 +89,42 @@ static UniqueCUvideodecoder createDecoder( decoderParams.display_area.bottom = videoFormat->display_area.bottom; CUvideodecoder* decoder = new CUvideodecoder(); - result = cuvidCreateDecoder(decoder, &decoderParams); + CUresult result = cuvidCreateDecoder(decoder, &decoderParams); TORCH_CHECK( result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result); return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } +bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { + // Check decoder capabilities - same checks as DALI + auto caps = CUVIDDECODECAPS{}; + caps.eCodecType = videoFormat->codec; + caps.eChromaFormat = videoFormat->chroma_format; + caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; + CUresult result = cuvidGetDecoderCaps(&caps); + TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); + + if (!caps.bIsSupported) { + return false; + } + + if (!(videoFormat->coded_width >= caps.nMinWidth && + videoFormat->coded_height >= caps.nMinHeight && + videoFormat->coded_width <= caps.nMaxWidth && + videoFormat->coded_height <= caps.nMaxHeight)) { + return false; + } + + constexpr unsigned int macroblockConstant = 256; + if (!(videoFormat->coded_width * videoFormat->coded_height / + macroblockConstant <= + caps.nMaxMBCount)) { + return false; + } + + return true; +} + cudaVideoCodec validateCodecSupport(AVCodecID codecId) { switch (codecId) { case AV_CODEC_ID_H264: @@ -354,14 +345,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. 
- bool capabilityCheckFailed = false; - decoder_ = createDecoder(videoFormat, &capabilityCheckFailed); - - if (capabilityCheckFailed) { + // Check if NVDEC supports this video configuration + if (!videoIsSupported(videoFormat)) { usingCpuFallback_ = true; capabilityCheckPending_ = false; - return 0; + return static_cast(videoFormat_.min_num_decode_surfaces); } + + decoder_ = createDecoder(videoFormat); } TORCH_CHECK(decoder_, "Failed to get or create decoder"); From 55dffd96dd17199365204955485584fc2e1e8380 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 17:33:31 +0100 Subject: [PATCH 09/18] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 13 +++++++------ src/torchcodec/_core/BetaCudaDeviceInterface.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index b4cf4de1f..e8bdc7bd7 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,6 +54,8 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI + // Callers should ensure video is supported by calling videoIsSupported() first. + CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; decoderParams.ChromaFormat = videoFormat->chroma_format; @@ -345,11 +347,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - // Check if NVDEC supports this video configuration + if (!videoIsSupported(videoFormat)) { usingCpuFallback_ = true; - capabilityCheckPending_ = false; - return static_cast(videoFormat_.min_num_decode_surfaces); + return 0; } decoder_ = createDecoder(videoFormat); @@ -377,13 +378,13 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { "sendPacket received an empty packet, this is unexpected, please report."); // On first packet, store a copy before sending to CUDA parser - if (capabilityCheckPending_) { + if (isFirstPacket_) { // Make a deep copy of the packet before CUDA parser potentially corrupts it bufferedFirstPacket_ = av_packet_alloc(); TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); - capabilityCheckPending_ = false; + isFirstPacket_ = false; } // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or @@ -407,7 +408,7 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { // packet to CPU if (usingCpuFallback_) { printf("Falling back to CPU!!!! And re-sending packet\n"); - // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); + // // TORCH_CHECK(false, "Falling back to CPU!!!! 
And re-sending packet"); // Create AutoAVPacket, then ReferenceAVPacket to access get() method AutoAVPacket autoBufferedPacket; ReferenceAVPacket refBufferedPacket(autoBufferedPacket); diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index b03814268..5b3f7523c 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -98,7 +98,7 @@ class BetaCudaDeviceInterface : public DeviceInterface { // CPU fallback support std::unique_ptr cpuInterface_; bool usingCpuFallback_ = false; - bool capabilityCheckPending_ = true; + bool isFirstPacket_ = true; AVPacket* bufferedFirstPacket_ = nullptr; }; From e58746a1d26e078a884c4b380b912806110b6a01 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:17:50 +0100 Subject: [PATCH 10/18] Init checks - yummy --- .../_core/BetaCudaDeviceInterface.cpp | 184 +++++++++++------- .../_core/BetaCudaDeviceInterface.h | 6 +- 2 files changed, 119 insertions(+), 71 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index e8bdc7bd7..d45ba6671 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI - // Callers should ensure video is supported by calling videoIsSupported() first. CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; @@ -127,7 +126,34 @@ bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { return true; } -cudaVideoCodec validateCodecSupport(AVCodecID codecId) { +std::optional mapChromaFormat( + const AVPixFmtDescriptor* desc) { + if (!desc) { + return std::nullopt; + } + + if (desc->nb_components == 1) { + return cudaVideoChromaFormat_Monochrome; + } + + // Check if it's YUV (has chroma planes and not RGB) + if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) { + // 4:4:4 (no subsampling) + return cudaVideoChromaFormat_444; + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) { + // 4:2:0 (2x2 subsampling) + return cudaVideoChromaFormat_420; + } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) { + // 4:2:2 (2x1 subsampling) + return cudaVideoChromaFormat_422; + } + } + + return std::nullopt; +} + +std::optional validateCodecSupport(AVCodecID codecId) { switch (codecId) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; @@ -153,10 +179,68 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) { // return cudaVideoCodec_JPEG; // case AV_CODEC_ID_VC1: // return cudaVideoCodec_VC1; - default: { - TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId)); - } + default: + return std::nullopt; + } +} + +bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { + auto codecType = validateCodecSupport(codecContext->codec_id); + if (!codecType.has_value()) { + return true; + } + + const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt); + if (!desc) { + return true; + } + + auto chromaFormat = mapChromaFormat(desc); + if (!chromaFormat.has_value()) { + return true; + } + + auto caps = CUVIDDECODECAPS{}; + caps.eCodecType = codecType.value(); + caps.eChromaFormat = 
chromaFormat.value(); + caps.nBitDepthMinus8 = desc->comp[0].depth - 8; + + CUresult result = cuvidGetDecoderCaps(&caps); + if (result != CUDA_SUCCESS) { + return true; + } + + if (!caps.bIsSupported) { + return true; + } + + if (!(static_cast(codecContext->coded_width) >= + caps.nMinWidth && + static_cast(codecContext->coded_height) >= + caps.nMinHeight && + static_cast(codecContext->coded_width) <= + caps.nMaxWidth && + static_cast(codecContext->coded_height) <= + caps.nMaxHeight)) { + return true; } + + // See nMaxMBCount in cuviddec.h + constexpr unsigned int macroblockConstant = 256; + if (!(static_cast( + codecContext->coded_width * codecContext->coded_height) / + macroblockConstant <= + caps.nMaxMBCount)) { + return true; + } + + // We explicitly request NV12 output format in createDecoder(), so we need to + // make sure it's supported. + if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + return true; + } + + return false; } } // namespace @@ -189,11 +273,6 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { videoParser_ = nullptr; } - // Clean up buffered packet if it wasn't used (commented out to avoid - // potential issues) if (bufferedFirstPacket_) { - // av_packet_free(&bufferedFirstPacket_); - // } - returnNppStreamContextToCache(device_, std::move(nppCtx_)); } @@ -201,6 +280,19 @@ void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, [[maybe_unused]] const SharedAVCodecContext& codecContext) { + if (shouldFallbackToCPU(codecContext)) { + cpuFallback_ = createDeviceInterface(torch::kCPU); + TORCH_CHECK( + cpuFallback_ != nullptr, "Failed to create CPU device interface"); + cpuFallback_->initialize(avStream, avFormatCtx, codecContext); + cpuFallback_->initializeVideo( + VideoStreamOptions(), + {}, + /*resizedOutputDims=*/std::nullopt); + // We'll always use the CPU fallback from now on, so we can return early. + return; + } + TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); timeBase_ = avStream->time_base; frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; @@ -208,21 +300,15 @@ void BetaCudaDeviceInterface::initialize( const AVCodecParameters* codecPar = avStream->codecpar; TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null"); - // Initialize CPU interface for potential fallback - cpuInterface_ = createDeviceInterface(torch::kCPU); - TORCH_CHECK( - cpuInterface_ != nullptr, "Failed to create CPU device interface"); - cpuInterface_->initialize(avStream, avFormatCtx, codecContext); - cpuInterface_->initializeVideo( - VideoStreamOptions(), - {}, - /*resizedOutputDims=*/std::nullopt); - initializeBSF(codecPar, avFormatCtx); // Create parser. Default values that aren't obvious are taken from DALI. CUVIDPARSERPARAMS parserParams = {}; - parserParams.CodecType = validateCodecSupport(codecPar->codec_id); + auto codecType = validateCodecSupport(codecPar->codec_id); + TORCH_CHECK( + codecType.has_value(), + "This should never happen, we should be using the CPU fallback by now. Please report a bug."); + parserParams.CodecType = codecType.value(); parserParams.ulMaxNumDecodeSurfaces = 8; parserParams.ulMaxDisplayDelay = 0; // Callback setup, all are triggered by the parser within a call @@ -347,11 +433,6 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. 
- - if (!videoIsSupported(videoFormat)) { - usingCpuFallback_ = true; - return 0; - } decoder_ = createDecoder(videoFormat); } @@ -368,25 +449,14 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to // the NVCUVID parser. int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { - printf("usingCpuFallback_: %d\n", usingCpuFallback_); - if (usingCpuFallback_) { - return cpuInterface_->sendPacket(packet); + if (cpuFallback_) { + return cpuFallback_->sendPacket(packet); } TORCH_CHECK( packet.get() && packet->data && packet->size > 0, "sendPacket received an empty packet, this is unexpected, please report."); - // On first packet, store a copy before sending to CUDA parser - if (isFirstPacket_) { - // Make a deep copy of the packet before CUDA parser potentially corrupts it - bufferedFirstPacket_ = av_packet_alloc(); - TORCH_CHECK(bufferedFirstPacket_, "Failed to allocate packet for fallback"); - int ret = av_packet_ref(bufferedFirstPacket_, packet.get()); - TORCH_CHECK(ret >= 0, "Failed to copy packet for fallback"); - isFirstPacket_ = false; - } - // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or // the original one if no BSF is needed. This new filtered packet must be // allocated outside of applyBSF: if it were allocated inside applyBSF, it @@ -402,27 +472,12 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { cuvidPacket.flags = CUVID_PKT_TIMESTAMP; cuvidPacket.timestamp = packetToSend->pts; - int result = sendCuvidPacket(cuvidPacket); - - // If capability check failed and we switched to CPU fallback, send buffered - // packet to CPU - if (usingCpuFallback_) { - printf("Falling back to CPU!!!! And re-sending packet\n"); - // // TORCH_CHECK(false, "Falling back to CPU!!!! And re-sending packet"); - // Create AutoAVPacket, then ReferenceAVPacket to access get() method - AutoAVPacket autoBufferedPacket; - ReferenceAVPacket refBufferedPacket(autoBufferedPacket); - // Copy the buffered packet data - av_packet_ref(refBufferedPacket.get(), bufferedFirstPacket_); - return cpuInterface_->sendPacket(refBufferedPacket); - } - - return result; + return sendCuvidPacket(cuvidPacket); } int BetaCudaDeviceInterface::sendEOFPacket() { - if (usingCpuFallback_) { - return cpuInterface_->sendEOFPacket(); + if (cpuFallback_) { + return cpuFallback_->sendEOFPacket(); } CUVIDSOURCEDATAPACKET cuvidPacket = {}; @@ -469,9 +524,6 @@ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF( // given frame. It means we can send that frame to be decoded by the hardware // NVDEC decoder by calling cuvidDecodePicture which is non-blocking. int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) { - if (usingCpuFallback_) { - return 1; // success - } TORCH_CHECK(picParams != nullptr, "Invalid picture parameters"); TORCH_CHECK(decoder_, "Decoder not initialized before picture decode"); // Send frame to be decoded by NVDEC - non-blocking call. @@ -489,8 +541,8 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder( // Moral equivalent of avcodec_receive_frame(). 
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { - if (usingCpuFallback_) { - return cpuInterface_->receiveFrame(avFrame); + if (cpuFallback_) { + return cpuFallback_->receiveFrame(avFrame); } if (readyFrames_.empty()) { @@ -627,8 +679,8 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( } void BetaCudaDeviceInterface::flush() { - if (usingCpuFallback_) { - cpuInterface_->flush(); + if (cpuFallback_) { + cpuFallback_->flush(); return; } @@ -649,10 +701,10 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { - if (usingCpuFallback_) { + if (cpuFallback_) { // CPU decoded frame - need to do CPU color conversion then transfer to GPU FrameOutput cpuFrameOutput; - cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); + cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); // Transfer CPU frame to GPU if (preAllocatedOutputTensor.has_value()) { diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 5b3f7523c..7424a877d 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -95,11 +95,7 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; - // CPU fallback support - std::unique_ptr cpuInterface_; - bool usingCpuFallback_ = false; - bool isFirstPacket_ = true; - AVPacket* bufferedFirstPacket_ = nullptr; + std::unique_ptr cpuFallback_; }; } // namespace facebook::torchcodec From 8373eec7bc740f391804897296d6d00d6accbadc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:20:14 +0100 Subject: [PATCH 11/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index d45ba6671..e36a47925 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -96,36 +96,6 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } -bool videoIsSupported(CUVIDEOFORMAT* videoFormat) { - // Check decoder capabilities - same checks as DALI - auto caps = CUVIDDECODECAPS{}; - caps.eCodecType = videoFormat->codec; - caps.eChromaFormat = videoFormat->chroma_format; - caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8; - CUresult result = cuvidGetDecoderCaps(&caps); - TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result); - - if (!caps.bIsSupported) { - return false; - } - - if (!(videoFormat->coded_width >= caps.nMinWidth && - videoFormat->coded_height >= caps.nMinHeight && - videoFormat->coded_width <= caps.nMaxWidth && - videoFormat->coded_height <= caps.nMaxHeight)) { - return false; - } - - constexpr unsigned int macroblockConstant = 256; - if (!(videoFormat->coded_width * videoFormat->coded_height / - macroblockConstant <= - caps.nMaxMBCount)) { - return false; - } - - return true; -} - std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { if (!desc) { From 4178e2350832fe48820cabd75de0d43b8cf32e62 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 16 Oct 2025 19:37:51 +0100 Subject: [PATCH 12/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 
deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index e36a47925..17e78abda 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -98,25 +98,18 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { - if (!desc) { - return std::nullopt; - } + TORCH_CHECK(desc != nullptr, "desc can't be null"); if (desc->nb_components == 1) { return cudaVideoChromaFormat_Monochrome; - } - - // Check if it's YUV (has chroma planes and not RGB) - if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) { + // Make sure it's YUV: has chroma planes and isn't RGB if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) { - // 4:4:4 (no subsampling) - return cudaVideoChromaFormat_444; + return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4 } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) { - // 4:2:0 (2x2 subsampling) - return cudaVideoChromaFormat_420; + return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0 } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) { - // 4:2:2 (2x1 subsampling) - return cudaVideoChromaFormat_422; + return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2 } } @@ -154,20 +147,20 @@ std::optional validateCodecSupport(AVCodecID codecId) { } } -bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { +bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { auto codecType = validateCodecSupport(codecContext->codec_id); if (!codecType.has_value()) { - return true; + return false; } const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt); if (!desc) { - return true; + return false; } auto chromaFormat = mapChromaFormat(desc); if (!chromaFormat.has_value()) { - return true; + return false; } auto caps = CUVIDDECODECAPS{}; @@ -177,11 +170,11 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { CUresult result = cuvidGetDecoderCaps(&caps); if (result != CUDA_SUCCESS) { - return true; + return false; } if (!caps.bIsSupported) { - return true; + return false; } if (!(static_cast(codecContext->coded_width) >= @@ -192,7 +185,7 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { caps.nMaxWidth && static_cast(codecContext->coded_height) <= caps.nMaxHeight)) { - return true; + return false; } // See nMaxMBCount in cuviddec.h @@ -201,16 +194,16 @@ bool shouldFallbackToCPU(const SharedAVCodecContext& codecContext) { codecContext->coded_width * codecContext->coded_height) / macroblockConstant <= caps.nMaxMBCount)) { - return true; + return false; } // We explicitly request NV12 output format in createDecoder(), so we need to // make sure it's supported. 
if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { - return true; + return false; } - return false; + return true; } } // namespace @@ -250,7 +243,7 @@ void BetaCudaDeviceInterface::initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, [[maybe_unused]] const SharedAVCodecContext& codecContext) { - if (shouldFallbackToCPU(codecContext)) { + if (!nativeNVDECSupport(codecContext)) { cpuFallback_ = createDeviceInterface(torch::kCPU); TORCH_CHECK( cpuFallback_ != nullptr, "Failed to create CPU device interface"); @@ -403,7 +396,6 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { // TODONVDEC P2: consider re-configuring an existing decoder instead of // re-creating one. See docs, see DALI. Re-configuration doesn't seem to // be enabled in DALI by default. - decoder_ = createDecoder(videoFormat); } From 234e1d99db810311bfdb375f450f92804d7fbd74 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 11:14:35 +0100 Subject: [PATCH 13/18] Fix merge --- src/torchcodec/_core/FFMPEGCommon.h | 6 ++++++ src/torchcodec/_core/SingleStreamDecoder.cpp | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 448333e20..337616ddc 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -73,6 +73,12 @@ using UniqueAVCodecContext = std::unique_ptr< Deleterp>; using SharedAVCodecContext = std::shared_ptr; +// create SharedAVCodecContext with custom deleter +inline SharedAVCodecContext makeSharedAVCodecContext(AVCodecContext* ctx) { + return SharedAVCodecContext( + ctx, Deleterp{}); +} + using UniqueAVFrame = std::unique_ptr>; using UniqueAVFilterGraph = std::unique_ptr< diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 0a7b89691..ba7382c67 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -440,8 +440,7 @@ void SingleStreamDecoder::addStream( AVCodecContext* codecContext = avcodec_alloc_context3(avCodec); TORCH_CHECK(codecContext != nullptr); - streamInfo.codecContext = SharedAVCodecContext( - codecContext, [](AVCodecContext* ctx) { avcodec_free_context(&ctx); }); + streamInfo.codecContext = makeSharedAVCodecContext(codecContext); int retVal = avcodec_parameters_to_context( streamInfo.codecContext.get(), streamInfo.stream->codecpar); From 82db4350748851f98e45f3b760972130362604b7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 11:41:18 +0100 Subject: [PATCH 14/18] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 33 ++++++++------ test/test_decoders.py | 43 +++++++++++-------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 17e78abda..747e5e82a 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -98,6 +98,8 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { std::optional mapChromaFormat( const AVPixFmtDescriptor* desc) { + // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt + // otherwise. 
TORCH_CHECK(desc != nullptr, "desc can't be null"); if (desc->nb_components == 1) { @@ -117,6 +119,10 @@ std::optional mapChromaFormat( } std::optional validateCodecSupport(AVCodecID codecId) { + // Return the corresponding cudaVideoCodec if supported, std::nullopt + // otherwise + // Note that we currently return nullopt (and thus fallback to CPU) for some + // codecs that are technically supported by NVDEC, see comment below. switch (codecId) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; @@ -148,6 +154,8 @@ std::optional validateCodecSupport(AVCodecID codecId) { } bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { + // Return true iff the input video stream is supported by our NVDEC + // implementation. auto codecType = validateCodecSupport(codecContext->codec_id); if (!codecType.has_value()) { return false; @@ -177,28 +185,25 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { return false; } - if (!(static_cast(codecContext->coded_width) >= - caps.nMinWidth && - static_cast(codecContext->coded_height) >= - caps.nMinHeight && - static_cast(codecContext->coded_width) <= - caps.nMaxWidth && - static_cast(codecContext->coded_height) <= - caps.nMaxHeight)) { + auto coded_width = static_cast(codecContext->coded_width); + auto coded_height = static_cast(codecContext->coded_height); + if (!(coded_width >= static_cast(caps.nMinWidth) && + coded_height >= static_cast(caps.nMinHeight) && + coded_width <= caps.nMaxWidth && coded_height <= caps.nMaxHeight)) { return false; } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - if (!(static_cast( - codecContext->coded_width * codecContext->coded_height) / - macroblockConstant <= - caps.nMaxMBCount)) { + if (!(coded_width * coded_height / macroblockConstant <= caps.nMaxMBCount)) { return false; } - // We explicitly request NV12 output format in createDecoder(), so we need to - // make sure it's supported. + // We'll set the decoderParams.OutputFormat to NV12, so we need to make + // sure it's actually supported. + // TODO: If this fail, we could consider decoding to something else than NV12 + // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is + // what FFmpeg does. if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { return false; } diff --git a/test/test_decoders.py b/test/test_decoders.py index 873fe68a4..07bec9ac7 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1701,19 +1701,19 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): assert beta_frame.duration_seconds == ref_frame.duration_seconds @needs_cuda - def test_beta_cuda_interface_small_h265(self): - # Test to illustrate current difference in behavior between the BETA and - # the ffmpeg interface: this video isn't supported by NVDEC, but in the - # ffmpeg interface, FFMPEG fallsback to the CPU while we don't. - - print() - a = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - # with pytest.raises( - # RuntimeError, - # match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", - # ): - b = VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0) - torch.testing.assert_close(a.data, b.data, rtol=0, atol=0) + def test_beta_cuda_interface_cpu_fallback(self): + # Non-regression test for the CPU fallback behavior of the BETA CUDA + # interface. + # We know that the H265_VIDEO asset isn't supported by NVDEC, its + # dimensions are too small. 
We also know that the FFmpeg CUDA interface + # fallbacks to the CPU path in such cases. We assert that we fall back + # to the CPU path, too. + + ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + with set_cuda_backend("beta"): + beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + + torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): @@ -1739,20 +1739,25 @@ def test_set_cuda_backend(self): assert _get_cuda_backend() == "beta" def assert_decoder_uses(decoder, *, expected_backend): + # TODO: This doesn't work anymore after + # https://github.com/meta-pytorch/torchcodec/pull/977 + # We need to define a better way to assert which backend a decoder + # is using. + return # Assert that a decoder instance is using a given backend. # # We know H265_VIDEO fails on the BETA backend while it works on the # ffmpeg one. - if expected_backend == "ffmpeg": - decoder.get_frame_at(0) # this would fail if this was BETA - else: - with pytest.raises(RuntimeError, match="Video is too small"): - decoder.get_frame_at(0) + # if expected_backend == "ffmpeg": + # decoder.get_frame_at(0) # this would fail if this was BETA + # else: + # with pytest.raises(RuntimeError, match="Video is too small"): + # decoder.get_frame_at(0) # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - assert_decoder_uses(dec, expected_backend="ffmpeg") + # assert_decoder_uses(dec, expected_backend="ffmpeg") # Check the setting "beta" effectively uses the BETA backend. # We also show that the affects decoder creation only. When the decoder From f96264baf72c41e382f098a7a8676f6eb2d48bc4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:18:10 +0100 Subject: [PATCH 15/18] slightly simplify --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 747e5e82a..a2e907ace 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -187,15 +187,15 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { auto coded_width = static_cast(codecContext->coded_width); auto coded_height = static_cast(codecContext->coded_height); - if (!(coded_width >= static_cast(caps.nMinWidth) && - coded_height >= static_cast(caps.nMinHeight) && - coded_width <= caps.nMaxWidth && coded_height <= caps.nMaxHeight)) { + if (coded_width < static_cast(caps.nMinWidth) || + coded_height < static_cast(caps.nMinHeight) || + coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) { return false; } // See nMaxMBCount in cuviddec.h constexpr unsigned int macroblockConstant = 256; - if (!(coded_width * coded_height / macroblockConstant <= caps.nMaxMBCount)) { + if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) { return false; } @@ -204,7 +204,9 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { // TODO: If this fail, we could consider decoding to something else than NV12 // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is // what FFmpeg does. 
- if (!((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) { + bool supportsNV12Output = + (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1; + if (!supportsNV12Output) { return false; } From 68867488d051f1dc4718d1d819e464d21a61a09b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:20:45 +0100 Subject: [PATCH 16/18] nit --- test/test_decoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_decoders.py b/test/test_decoders.py index 07bec9ac7..098e4e969 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1757,7 +1757,7 @@ def assert_decoder_uses(decoder, *, expected_backend): # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - # assert_decoder_uses(dec, expected_backend="ffmpeg") + assert_decoder_uses(dec, expected_backend="ffmpeg") # Check the setting "beta" effectively uses the BETA backend. # We also show that the affects decoder creation only. When the decoder From 982979b37322703575046dfed69b821dc80a6e20 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:21:10 +0100 Subject: [PATCH 17/18] Nit --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a2e907ace..a5c5e6472 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -54,7 +54,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Decoder creation parameters, most are taken from DALI - CUVIDDECODECREATEINFO decoderParams = {}; decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8; decoderParams.ChromaFormat = videoFormat->chroma_format; From 0c6e98607ed6609b907e1c50824c7b3b5a5223e2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Oct 2025 12:24:04 +0100 Subject: [PATCH 18/18] consistent names --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a5c5e6472..7124e4309 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -95,7 +95,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{}); } -std::optional mapChromaFormat( +std::optional validateChromaSupport( const AVPixFmtDescriptor* desc) { // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt // otherwise. @@ -165,7 +165,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { return false; } - auto chromaFormat = mapChromaFormat(desc); + auto chromaFormat = validateChromaSupport(desc); if (!chromaFormat.has_value()) { return false; }
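
The chroma classification used by validateChromaSupport() can be exercised against libavutil directly. The sketch below is illustrative only and is not part of the patches above: it reimplements the same log2_chroma_w / log2_chroma_h branch on a descriptor obtained from av_pix_fmt_desc_get(), and assumes a typical FFmpeg dev setup (compile and link with -lavutil).

extern "C" {
#include <libavutil/pixdesc.h>
}
#include <cstdio>

// Classify chroma subsampling the same way the patch does: log2_chroma_w and
// log2_chroma_h describe how much the chroma planes are downscaled.
static const char* classifyChroma(const AVPixFmtDescriptor* desc) {
  if (desc->nb_components == 1) {
    return "monochrome";
  }
  if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
    if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
      return "4:4:4"; // 1x1 subsampling
    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
      return "4:2:0"; // 2x2 subsampling
    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
      return "4:2:2"; // 2x1 subsampling
    }
  }
  return "unsupported";
}

int main() {
  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUV420P);
  if (desc == nullptr) {
    return 1;
  }
  // yuv420p has 2x2 chroma subsampling, so this prints "yuv420p -> 4:2:0".
  std::printf("%s -> %s\n", desc->name, classifyChroma(desc));
  return 0;
}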
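
For intuition on the two arithmetic checks in nativeNVDECSupport(), here is a standalone sketch with made-up capability values (a real query would come from cuvidGetDecoderCaps(); the numbers below are hypothetical). nMaxMBCount is expressed in 16x16 macroblocks, hence the divide by 256, and the NV12 check tests the bit of nOutputFormatMask indexed by cudaVideoSurfaceFormat_NV12, which is 0 in cuviddec.h.

#include <cstdio>

int main() {
  // Hypothetical stream dimensions and decoder capabilities.
  unsigned int codedWidth = 3840;
  unsigned int codedHeight = 2160;
  unsigned int nMaxMBCount = 65536;      // max surface size, in 16x16 macroblocks
  unsigned int nOutputFormatMask = 0x3;  // bit i set => surface format i supported

  // 3840 * 2160 / 256 = 32400 macroblocks, well under the 65536 cap.
  unsigned int macroblocks = codedWidth * codedHeight / 256;
  bool fitsMBCount = macroblocks <= nMaxMBCount;

  // cudaVideoSurfaceFormat_NV12 == 0, so this tests bit 0 of the mask.
  bool supportsNV12 = (nOutputFormatMask >> 0) & 1;

  std::printf("macroblocks=%u fits=%d nv12=%d -> %s\n", macroblocks,
              fitsMBCount, supportsNV12,
              (fitsMBCount && supportsNV12) ? "use NVDEC" : "CPU fallback");
  return 0;
}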
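
The makeSharedAVCodecContext() helper introduced in PATCH 13 boils down to a shared_ptr whose deleter calls avcodec_free_context, so the codec context can be co-owned by the decoder's stream state and the device interface. Below is a minimal sketch of the same idea using a lambda deleter instead of the repo's Deleterp wrapper; it is illustrative only and assumes linking against libavcodec.

extern "C" {
#include <libavcodec/avcodec.h>
}
#include <memory>

int main() {
  const AVCodec* codec = avcodec_find_decoder(AV_CODEC_ID_H264);
  if (codec == nullptr) {
    return 1;
  }
  AVCodecContext* raw = avcodec_alloc_context3(codec);
  if (raw == nullptr) {
    return 1;
  }
  // Shared ownership: the context is freed exactly once, when the last owner
  // (e.g. decoder state or device interface) releases its reference.
  std::shared_ptr<AVCodecContext> ctx(
      raw, [](AVCodecContext* c) { avcodec_free_context(&c); });
  return 0;
}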