meta-pytorch
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 576 additions & 0 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 576 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 129 additions & 0 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 19 additions & 1 deletion b/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎src/torchcodec/_core/CpuDeviceInterface.cpp‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/_core/CpuDeviceInterface.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 71 additions & 51 deletions b/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 71 additions & 51 deletions
@@ -0,0 +1,129 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+// BETA CUDA device interface that provides direct control over NVDEC
+// while keeping FFmpeg for demuxing. A lot of the logic, particularly the use
+// of a cache for the decoders, is inspired by DALI's implementation which is
+// APACHE 2.0:
+// https://github.com/NVIDIA/DALI/blob/c7539676a24a8e9e99a6e8665e277363c5445259/dali/operators/video/frames_decoder_gpu.cc#L1
+//
+// NVDEC / NVCUVID docs:
+// https://docs.nvidia.com/video-technologies/video-codec-sdk/13.0/nvdec-video-decoder-api-prog-guide/index.html#using-nvidia-video-decoder-nvdecode-api
+
+#pragma once
+
+#include "src/torchcodec/_core/Cache.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/NVDECCache.h"
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+namespace facebook::torchcodec {
+
+class BetaCudaDeviceInterface : public DeviceInterface {
+ public:
+  explicit BetaCudaDeviceInterface(const torch::Device& device);
+  virtual ~BetaCudaDeviceInterface();
+
+  void initializeInterface(AVStream* stream) override;
+
+  void convertAVFrameToFrameOutput(
+      const VideoStreamOptions& videoStreamOptions,
+      const AVRational& timeBase,
+      UniqueAVFrame& avFrame,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor =
+          std::nullopt) override;
+
+  bool canDecodePacketDirectly() const override {
+    return true;
+  }
+
+  int sendPacket(ReferenceAVPacket& packet) override;
+  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
+  void flush() override;
+
+  // NVDEC callback functions (must be public for C callbacks)
+  int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
+  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);
+
+ private:
+  // Apply bitstream filter, modifies packet in-place
+  void applyBSF(ReferenceAVPacket& packet);
+
+  class FrameBuffer {
+   public:
+    // One buffered frame. Every member carries a default initializer (the
+    // same idiom as videoFormat_ below), so a default-constructed Slot has a
+    // zeroed dispInfo, guessedPts == -1 and occupied == false without needing
+    // a hand-written constructor or std::memset (and thus <cstring>).
+    struct Slot {
+      CUVIDPARSERDISPINFO dispInfo = {};
+      int64_t guessedPts = -1;
+      bool occupied = false;
+    };
+
+    // TODONVDEC P1: init size should probably be min_num_decode_surfaces from
+    // video format
+    FrameBuffer() : frameBuffer_(4) {} // starts with 4 default (empty) slots
+
+    ~FrameBuffer() = default;
+
+    Slot* findEmptySlot();
+    Slot* findFrameWithExactPts(int64_t desiredPts);
+
+    // Iterator support for range-based for loops
+    auto begin() {
+      return frameBuffer_.begin();
+    }
+
+    auto end() {
+      return frameBuffer_.end();
+    }
+
+   private:
+    std::vector<Slot> frameBuffer_;
+  };
+
+  UniqueAVFrame convertCudaFrameToAVFrame(
+      CUdeviceptr framePtr,
+      unsigned int pitch,
+      const CUVIDPARSERDISPINFO& dispInfo);
+
+  CUvideoparser videoParser_ = nullptr;
+  UniqueCUvideodecoder decoder_;
+  CUVIDEOFORMAT videoFormat_ = {};
+
+  FrameBuffer frameBuffer_;
+
+  std::queue<int64_t> packetsPtsQueue; // NOTE(review): lacks the trailing '_' used by the sibling members
+
+  bool eofSent_ = false;
+
+  // Flush flag to prevent decode operations during flush (like DALI's
+  // isFlushing_)
+  bool isFlushing_ = false;
+
+  AVRational timeBase_ = {0, 0};
+
+  UniqueAVBSFContext bitstreamFilter_;
+
+  // Default CUDA interface for color conversion.
+  // TODONVDEC P2: we shouldn't need to keep a separate instance of the default.
+  // See other TODO there about how interfaces should be completely independent.
+  std::unique_ptr<DeviceInterface> defaultCudaInterface_;
+};
+
+} // namespace facebook::torchcodec
@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp)
     endif()
 
     set(core_library_dependencies
@@ -108,9 +108,27 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
+        # Try to find NVCUVID. Try the normal way first. This should work locally.
+        find_library(NVCUVID_LIBRARY NAMES nvcuvid)
+        # If not found, try the versioned file name, then a hardcoded path that is
+        # only accepted when the file really exists. Appears to be necessary on the CI.
+        if(NOT NVCUVID_LIBRARY)
+            find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 libnvcuvid.so.1 PATHS /usr/lib64 /usr/lib)
+        endif()
+        if(NOT NVCUVID_LIBRARY AND EXISTS "/usr/lib64/libnvcuvid.so.1")
+            set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
+        endif()
+
+        if(NVCUVID_LIBRARY)
+            message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
+        else()
+            message(FATAL_ERROR "Could not find NVCUVID library")
+        endif()
+
         list(APPEND core_library_dependencies
             ${CUDA_nppi_LIBRARY}
             ${CUDA_nppicc_LIBRARY}
+            ${NVCUVID_LIBRARY}
         )
     endif()
 
 
@@ -10,7 +10,7 @@ namespace facebook::torchcodec {
 namespace {
 
 static bool g_cpu = registerDeviceInterface(
-    torch::kCPU,
+    DeviceInterfaceKey(torch::kCPU), // key under which the CPU factory below is registered
     [](const torch::Device& device) { return new CpuDeviceInterface(device); });
 
 } // namespace
 
@@ -13,11 +13,21 @@ extern "C" {
 #include <libavutil/pixdesc.h>
 }
 
+// TODONVDEC P1 Changes were made to this file to accommodate the BETA CUDA
+// interface (see other TODONVDEC below). That's because the BETA CUDA interface
+// relies on this default CUDA interface to do the color conversion. That's
+// hacky, ugly, and leads to complicated code. We should refactor all this so
+// that an interface doesn't need to know anything about any other interface.
+// Note - this is more than just about the BETA CUDA interface: this default
+// interface already relies on the CPU interface to do software decoding when
+// needed, and that's already leading to similar complications.
+
 namespace facebook::torchcodec {
 namespace {
 
-static bool g_cuda =
-    registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
+static bool g_cuda = registerDeviceInterface(
+    DeviceInterfaceKey(torch::kCUDA), // key under which the CUDA factory below is registered
+    [](const torch::Device& device) {
       return new CudaDeviceInterface(device);
     });
 
@@ -193,13 +203,18 @@ CudaDeviceInterface::~CudaDeviceInterface() {
   }
 }
 
-void CudaDeviceInterface::initialize(
-    AVCodecContext* codecContext,
-    const AVRational& timeBase) {
-  TORCH_CHECK(ctx_, "FFmpeg HW device has not been initialized");
-  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
-  codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
-  timeBase_ = timeBase;
+void CudaDeviceInterface::initialize(const AVStream* avStream) {
+  TORCH_CHECK(avStream != nullptr, "avStream is null");
+  timeBase_ = avStream->time_base; // keep the stream's time base on this interface
+
+  cpuInterface_ = createDeviceInterface(torch::kCPU); // built once here; convertAVFrameToFrameOutput() reuses it for the CPU conversion path
+  TORCH_CHECK(
+      cpuInterface_ != nullptr, "Failed to create CPU device interface");
+  cpuInterface_->initialize(avStream);
+  cpuInterface_->initializeVideo(
+      VideoStreamOptions(),
+      {},
+      /*resizedOutputDims=*/std::nullopt);
 }
 
 void CudaDeviceInterface::initializeVideo(
@@ -209,6 +224,13 @@ void CudaDeviceInterface::initializeVideo(
   videoStreamOptions_ = videoStreamOptions;
 }
 
+void CudaDeviceInterface::registerHardwareDeviceWithCodec(
+    AVCodecContext* codecContext) { // stores a reference to ctx_ in codecContext->hw_device_ctx
+  TORCH_CHECK(ctx_, "FFmpeg HW device has not been initialized");
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  codecContext->hw_device_ctx = av_buffer_ref(ctx_.get()); // NOTE(review): av_buffer_ref() result is stored without a null check
+}
+
 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
     UniqueAVFrame& avFrame) {
   // We need FFmpeg filters to handle those conversion cases which are not
@@ -222,6 +244,12 @@ UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
     return std::move(avFrame);
   }
 
+  if (avFrame->hw_frames_ctx == nullptr) {
+    // TODONVDEC P2 return early for the beta interface where avFrames don't
+    // have a hw_frames_ctx. We should get rid of this or improve the logic.
+    return std::move(avFrame);
+  }
+
   auto hwFramesCtx =
       reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
   TORCH_CHECK(
@@ -351,19 +379,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     } else {
       // Reason 2 above. We need to do a full conversion which requires an
       // actual CPU device.
-      //
-      // TODO: Perhaps we should cache cpuInterface?
-      auto cpuInterface = createDeviceInterface(torch::kCPU);
-      TORCH_CHECK(
-          cpuInterface != nullptr, "Failed to create CPU device interface");
-      cpuInterface->initialize(
-          /*codecContext=*/nullptr, timeBase_);
-      cpuInterface->initializeVideo(
-          VideoStreamOptions(),
-          {},
-          /*resizedOutputDims=*/std::nullopt);
-
-      cpuInterface->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
+      cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
     }
 
     // Finally, we need to send the frame back to the GPU. Note that the
@@ -383,22 +399,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
   // because this is what the NPP color conversion routines expect. This SHOULD
   // be enforced by our call to maybeConvertAVFrameToNV12OrRGB24() above.
-  auto hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
-  TORCH_CHECK(
-      hwFramesCtx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
-
-  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+  // TODONVDEC P2 this can be hit from the beta interface, but there's no
+  // hw_frames_ctx in this case. We should try to understand how that affects
+  // this validation.
+  AVHWFramesContext* hwFramesCtx = nullptr;
+  if (avFrame->hw_frames_ctx != nullptr) {
+    hwFramesCtx =
+        reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+    AVPixelFormat actualFormat = hwFramesCtx->sw_format;
 
-  TORCH_CHECK(
-      actualFormat == AV_PIX_FMT_NV12,
-      "The AVFrame is ",
-      (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
-                                         : "unknown"),
-      ", but we expected AV_PIX_FMT_NV12. "
-      "That's unexpected, please report this to the TorchCodec repo.");
+    TORCH_CHECK(
+        actualFormat == AV_PIX_FMT_NV12,
+        "The AVFrame is ",
+        (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
+                                           : "unknown"),
+        ", but we expected AV_PIX_FMT_NV12. "
+        "That's unexpected, please report this to the TorchCodec repo.");
+  }
 
   torch::Tensor& dst = frameOutput.data;
   if (preAllocatedOutputTensor.has_value()) {
@@ -418,21 +435,24 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // arbitrary, but unfortunately we know it's hardcoded to be the default
   // stream by FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
-  TORCH_CHECK(
-      hwFramesCtx->device_ctx != nullptr,
-      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
-  auto cudaDeviceCtx =
-      static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
-  TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
-
-  at::cuda::CUDAEvent nvdecDoneEvent;
-  at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
-      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
-  nvdecDoneEvent.record(nvdecStream);
-
-  // Don't start NPP work before NVDEC is done decoding the frame!
   at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
-  nvdecDoneEvent.block(nppStream);
+  if (hwFramesCtx) {
+    // TODONVDEC P2 this block won't be hit from the beta interface because
+    // there is no hwFramesCtx, but we should still make sure there's no CUDA
+    // stream sync issue in the beta interface.
+    TORCH_CHECK(
+        hwFramesCtx->device_ctx != nullptr,
+        "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
+    auto cudaDeviceCtx =
+        static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
+    TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
+    at::cuda::CUDAEvent nvdecDoneEvent;
+    at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
+        c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
+    nvdecDoneEvent.record(nvdecStream);
+    // Don't start NPP work before NVDEC is done decoding the frame!
+    nvdecDoneEvent.block(nppStream);
+  }
 
   // Create the NPP context if we haven't yet.
   nppCtx_->hStream = nppStream.stream();