meta-pytorch · NicolasHug · Oct 2, 2025 · Sep 25, 2025 · Sep 26, 2025 · Sep 30, 2025
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -0,0 +1,108 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+// BETA CUDA device interface that provides direct control over NVDEC
+// while keeping FFmpeg for demuxing. A lot of the logic, particularly the use
+// of a cache for the decoders, is inspired by DALI's implementation which is
+// APACHE 2.0:
+// https://github.com/NVIDIA/DALI/blob/c7539676a24a8e9e99a6e8665e277363c5445259/dali/operators/video/frames_decoder_gpu.cc#L1
+//
+// NVDEC / NVCUVID docs:
+// https://docs.nvidia.com/video-technologies/video-codec-sdk/13.0/nvdec-video-decoder-api-prog-guide/index.html#using-nvidia-video-decoder-nvdecode-api
+
+#pragma once
+
+#include "src/torchcodec/_core/Cache.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/NVDECCache.h"
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+namespace facebook::torchcodec {
+
+// Device interface that talks to NVDEC through the NVCUVID parser/decoder API
+// instead of going through FFmpeg's decoder. It reports
+// canDecodePacketDirectly() == true and exposes sendPacket() / receiveFrame()
+// / flush() so that the caller can feed packets and pull frames directly.
+class BetaCudaDeviceInterface : public DeviceInterface {
+ public:
+  explicit BetaCudaDeviceInterface(const torch::Device& device);
+  virtual ~BetaCudaDeviceInterface();
+
+  void initializeInterface(AVStream* stream) override;
+
+  // Fills frameOutput from avFrame. When preAllocatedOutputTensor is provided
+  // it is presumably used as the destination instead of allocating a new
+  // tensor -- TODO confirm against the implementation.
+  void convertAVFrameToFrameOutput(
+      const VideoStreamOptions& videoStreamOptions,
+      const AVRational& timeBase,
+      UniqueAVFrame& avFrame,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor =
+          std::nullopt) override;
+
+  // This interface always consumes packets itself.
+  bool canDecodePacketDirectly() const override {
+    return true;
+  }
+
+  // Packet-in / frame-out entry points.
+  // NOTE(review): the int return values presumably follow FFmpeg's
+  // avcodec_send_packet() / avcodec_receive_frame() conventions -- confirm in
+  // the .cpp.
+  int sendPacket(ReferenceAVPacket& packet) override;
+  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
+  void flush() override;
+  ReferenceAVPacket* applyBSF(
+      ReferenceAVPacket& packet,
+      AutoAVPacket& filteredAutoPacket,
+      ReferenceAVPacket& filteredPacket) override;
+
+  // NVDEC callback functions (must be public for C callbacks)
+  unsigned char streamPropertyChange(CUVIDEOFORMAT* videoFormat);
+  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);
+
+ private:
+  UniqueAVFrame convertCudaFrameToAVFrame(
+      CUdeviceptr framePtr,
+      unsigned int pitch,
+      const CUVIDPARSERDISPINFO& dispInfo);
+
+  // Raw NVCUVID parser handle; nullptr until a parser has been created.
+  CUvideoparser videoParser_ = nullptr;
+  UniqueCUvideodecoder decoder_;
+  CUVIDEOFORMAT videoFormat_ = {};
+
+  // One entry of frameBuffer_. Every field has an in-class initializer, so a
+  // default-constructed slot is empty: zero-initialized dispInfo, guessedPts
+  // of -1, and not occupied. No hand-written constructor (and no memset) is
+  // needed for that.
+  struct FrameBufferSlot {
+    CUVIDPARSERDISPINFO dispInfo = {};
+    int64_t guessedPts = -1;
+    bool occupied = false;
+  };
+
+  std::vector<FrameBufferSlot> frameBuffer_;
+  // Slot lookups into frameBuffer_.
+  // NOTE(review): presumably these return nullptr when no slot matches --
+  // confirm in the .cpp.
+  FrameBufferSlot* findEmptySlot();
+  FrameBufferSlot* findFrameWithExactPts(int64_t desiredPts);
+
+  // NOTE(review): unlike the other data members this one has no trailing
+  // underscore. Renaming it means touching the .cpp as well, so it is left
+  // as is here.
+  std::queue<int64_t> packetsPtsQueue;
+
+  bool eofSent_ = false;
+
+  // Flush flag to prevent decode operations during flush (like DALI's
+  // isFlushing_)
+  bool isFlushing_ = false;
+
+  AVRational timeBase_ = {0, 0};
+
+  UniqueAVBSFContext bitstreamFilter_;
+
+  // Default CUDA interface for color conversion.
+  // TODONVDEC P2: we shouldn't need to keep a separate instance of the default.
+  // See other TODO there about how interfaces should be completely independent.
+  std::unique_ptr<DeviceInterface> defaultCudaInterface_;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -98,7 +98,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp)
     endif()
 
     set(core_library_dependencies
@@ -111,6 +111,29 @@ function(make_torchcodec_libraries
             ${CUDA_nppi_LIBRARY}
             ${CUDA_nppicc_LIBRARY}
         )
+
+        # Try to find NVCUVID. Try the normal way first. This should work locally.
+        find_library(NVCUVID_LIBRARY NAMES nvcuvid)
+        # If not found, try with version suffix, or hardcoded path. Appears
+        # to be necessary on the CI.
+        if(NOT NVCUVID_LIBRARY)
+            find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
+        endif()
+        # Only fall back to the hardcoded path when that file is really there.
+        # Setting it unconditionally would make the check below always succeed,
+        # so a missing library would never be reported at configure time.
+        if(NOT NVCUVID_LIBRARY AND EXISTS "/usr/lib64/libnvcuvid.so.1")
+            set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
+        endif()
+
+        if(NVCUVID_LIBRARY)
+            message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
+        else()
+            message(FATAL_ERROR "Could not find NVCUVID library")
+        endif()
+
+        # Add CUDA Driver library (needed for cuCtxGetCurrent, etc.)
+        find_library(CUDA_DRIVER_LIBRARY NAMES cuda REQUIRED)
+        message(STATUS "Found CUDA Driver library: ${CUDA_DRIVER_LIBRARY}")
+
+        # Link both libraries into the core library.
+        list(APPEND core_library_dependencies ${NVCUVID_LIBRARY} ${CUDA_DRIVER_LIBRARY})
     endif()
 
     make_torchcodec_sublibrary(

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -13,6 +13,15 @@ extern "C" {
 #include <libavutil/pixdesc.h>
 }
 
+// TODONVDEC P1 Changes were made to this file to accommodate the BETA CUDA
+// interface (see other TODONVDEC below). That's because the BETA CUDA interface
+// relies on this default CUDA interface to do the color conversion. That's
+// hacky, ugly, and leads to complicated code. We should refactor all this so
+// that an interface doesn't need to know anything about any other interface.
+// Note - this is more than just about the BETA CUDA interface: this default
+// interface already relies on the CPU interface to do software decoding when
+// needed, and that's already leading to similar complications.
+
 namespace facebook::torchcodec {
 namespace {
 
@@ -216,10 +225,11 @@ std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
     return nullptr;
   }
 
-  TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
+  if (avFrame->hw_frames_ctx == nullptr) {
+    // TODONVDEC P2 return early for the beta interface, where avFrames don't
+    // have a hw_frames_ctx. We should get rid of this or improve the logic.
+    return nullptr;
+  }
 
   auto hwFramesCtx =
       reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
@@ -347,22 +357,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // Above we checked that the AVFrame was on GPU, but that's not enough, we
   // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
   // because this is what the NPP color conversion routines expect.
-  TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
-
-  auto hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
-  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+  // TODONVDEC P2 this can be hit from the beta interface, but there's no
+  // hw_frames_ctx in this case. We should try to understand how that affects
+  // this validation.
+  AVHWFramesContext* hwFramesCtx = nullptr;
+  if (avFrame->hw_frames_ctx != nullptr) {
+    hwFramesCtx =
+        reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+    AVPixelFormat actualFormat = hwFramesCtx->sw_format;
 
-  TORCH_CHECK(
-      actualFormat == AV_PIX_FMT_NV12,
-      "The AVFrame is ",
-      (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
-                                         : "unknown"),
-      ", but we expected AV_PIX_FMT_NV12. "
-      "That's unexpected, please report this to the TorchCodec repo.");
+    TORCH_CHECK(
+        actualFormat == AV_PIX_FMT_NV12,
+        "The AVFrame is ",
+        (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
+                                           : "unknown"),
+        ", but we expected AV_PIX_FMT_NV12. "
+        "That's unexpected, please report this to the TorchCodec repo.");
+  }
 
   auto frameDims =
       getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
@@ -396,19 +407,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // arbitrary, but unfortunately we know it's hardcoded to be the default
   // stream by FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
-  TORCH_CHECK(
-      hwFramesCtx->device_ctx != nullptr,
-      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
-  auto cudaDeviceCtx =
-      static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
-  at::cuda::CUDAEvent nvdecDoneEvent;
-  at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
-      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
-  nvdecDoneEvent.record(nvdecStream);
-
-  // Don't start NPP work before NVDEC is done decoding the frame!
   at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
-  nvdecDoneEvent.block(nppStream);
+  if (hwFramesCtx) {
+    // TODONVDEC P2 this block won't be hit from the beta interface because
+    // there is no hwFramesCtx, but we should still make sure there's no CUDA
+    // stream sync issue in the beta interface.
+    TORCH_CHECK(
+        hwFramesCtx->device_ctx != nullptr,
+        "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
+    auto cudaDeviceCtx =
+        static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
+    at::cuda::CUDAEvent nvdecDoneEvent;
+    at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
+        c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
+    nvdecDoneEvent.record(nvdecStream);
+    // Don't start NPP work before NVDEC is done decoding the frame!
+    nvdecDoneEvent.block(nppStream);
+  }
 
   // Create the NPP context if we haven't yet.
   nppCtx_->hStream = nppStream.stream();

diff --git a/src/torchcodec/_core/DeviceInterface.cpp b/src/torchcodec/_core/DeviceInterface.cpp
@@ -11,7 +11,8 @@
 namespace facebook::torchcodec {
 
 namespace {
-using DeviceInterfaceMap = std::map<torch::DeviceType, CreateDeviceInterfaceFn>;
+using DeviceInterfaceMap =
+    std::map<DeviceInterfaceKey, CreateDeviceInterfaceFn>;
 static std::mutex g_interface_mutex;
 
 DeviceInterfaceMap& getDeviceMap() {
@@ -30,50 +31,79 @@ std::string getDeviceType(const std::string& device) {
 } // namespace
 
+// Registers `createInterface` as the factory for `key` in the global device
+// map. g_interface_mutex is held for the whole call. TORCH_CHECK fails if a
+// factory is already registered under the same key. Always returns true.
 bool registerDeviceInterface(
-    torch::DeviceType deviceType,
+    const DeviceInterfaceKey& key,
     CreateDeviceInterfaceFn createInterface) {
   std::scoped_lock lock(g_interface_mutex);
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
   TORCH_CHECK(
-      deviceMap.find(deviceType) == deviceMap.end(),
-      "Device interface already registered for ",
-      deviceType);
-  deviceMap.insert({deviceType, createInterface});
+      deviceMap.find(key) == deviceMap.end(),
+      "Device interface already registered for device type ",
+      key.deviceType,
+      " variant '",
+      key.variant,
+      "'");
+  deviceMap.insert({key, createInterface});
 
   return true;
 }
 
-torch::Device createTorchDevice(const std::string device) {
+// Convenience overload: registers `createInterface` under the key built from
+// `deviceType` alone, i.e. DeviceInterfaceKey(deviceType), and forwards to the
+// key-based overload.
+bool registerDeviceInterface(
+    torch::DeviceType deviceType,
+    CreateDeviceInterfaceFn createInterface) {
+  const DeviceInterfaceKey deviceOnlyKey(deviceType);
+  return registerDeviceInterface(deviceOnlyKey, createInterface);
+}
+
+// Checks that a device interface is registered for the device type named by
+// `device` together with `variant`. The device type string is whatever
+// getDeviceType(device) returns; it is converted to a torch::DeviceType via
+// torch::Device. g_interface_mutex is held for the whole call. TORCH_CHECK
+// fails when no registered key has that device type and that variant.
+void validateDeviceInterface(
+    const std::string device,
+    const std::string variant) {
   std::scoped_lock lock(g_interface_mutex);
   std::string deviceType = getDeviceType(device);
+
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
+  // Find device interface that matches device type and variant
+  torch::DeviceType deviceTypeEnum = torch::Device(deviceType).type();
+
   auto deviceInterface = std::find_if(
       deviceMap.begin(),
       deviceMap.end(),
-      [&](const std::pair<torch::DeviceType, CreateDeviceInterfaceFn>& arg) {
-        return device.rfind(
-                   torch::DeviceTypeName(arg.first, /*lcase*/ true), 0) == 0;
+      // Take the map's own element type (its key is const-qualified). Spelling
+      // it as std::pair<DeviceInterfaceKey, ...> would convert every visited
+      // entry into a temporary pair before the comparison.
+      [&](const DeviceInterfaceMap::value_type& arg) {
+        return arg.first.deviceType == deviceTypeEnum &&
+            arg.first.variant == variant;
       });
-  TORCH_CHECK(
-      deviceInterface != deviceMap.end(), "Unsupported device: ", device);
 
-  return torch::Device(device);
+  TORCH_CHECK(
+      deviceInterface != deviceMap.end(),
+      "Unsupported device: ",
+      device,
+      " (device type: ",
+      deviceType,
+      ", variant: ",
+      variant,
+      ")");
 }
 
+// Instantiates the device interface registered for (device.type(), variant).
+// The registry lookup and the factory call both happen while
+// g_interface_mutex is held; the factory's return value is wrapped in a
+// std::unique_ptr. TORCH_CHECK fails when nothing is registered for that key.
 std::unique_ptr<DeviceInterface> createDeviceInterface(
-    const torch::Device& device) {
-  auto deviceType = device.type();
+    const torch::Device& device,
+    const std::string_view variant) {
+  DeviceInterfaceKey key(device.type(), variant);
   std::scoped_lock lock(g_interface_mutex);
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
-  TORCH_CHECK(
-      deviceMap.find(deviceType) != deviceMap.end(),
-      "Unsupported device: ",
-      device);
+  auto it = deviceMap.find(key);
+  if (it != deviceMap.end()) {
+    return std::unique_ptr<DeviceInterface>(it->second(device));
+  }
 
-  return std::unique_ptr<DeviceInterface>(deviceMap[deviceType](device));
+  // Reached only when the key is not registered; TORCH_CHECK(false, ...)
+  // reports the failure, so there is no return statement after it.
+  TORCH_CHECK(
+      false,
+      "No device interface found for device type: ",
+      device.type(),
+      " variant: '",
+      variant,
+      "'");
 }
 
 } // namespace facebook::torchcodec