Docs

NicolasHug · NicolasHug · commit 340974ab6a0a · 2025-10-21T23:36:35.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -212,6 +212,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
   return true;
 }
 
+// Callback for freeing CUDA memory associated with an AVFrame. See where it's
+// used for more details.
+void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
+  cudaFree(opaque);
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -665,20 +671,23 @@ void BetaCudaDeviceInterface::flush() {
   std::swap(readyFrames_, emptyQueue);
 }
 
-namespace {
-// Cleanup callback for CUDA memory allocated for GPU frames
-void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
-  cudaFree(opaque);
-}
-} // namespace
-
 UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
     UniqueAVFrame& cpuFrame) {
+  // This is called in the context of the CPU fallback: the frame was decoded on
+  // the CPU, and in this function we convert that frame into NV12 format and
+  // send it to the GPU.
+  // We do that in 2 steps:
+  // - First we convert the input CPU frame into an intermediate NV12 CPU frame
+  //   using sws_scale.
+  // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
+  //   is what we return.
+
   TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
 
   int width = cpuFrame->width;
   int height = cpuFrame->height;
 
+  // Intermediate NV12 CPU frame. It's not on the GPU yet.
   UniqueAVFrame nv12CpuFrame(av_frame_alloc());
   TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
 
@@ -707,7 +716,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
 
   int convertedHeight = sws_scale(
       swsContext_.get(),
-      const_cast<const uint8_t* const*>(cpuFrame->data),
+      cpuFrame->data,
       cpuFrame->linesize,
       0,
       height,
@@ -739,6 +748,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   gpuFrame->linesize[0] = width;
   gpuFrame->linesize[1] = width;
 
+  // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
+  // linesizes (strides) may be different than the widths for the input CPU
+  // frame. That's precisely what cudaMemcpy2D is for.
   err = cudaMemcpy2D(
       gpuFrame->data[0],
       gpuFrame->linesize[0],
@@ -771,10 +783,16 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
       "Failed to copy frame properties: ",
       getFFMPEGErrorStringFromErrorCode(ret));
 
+  // We're almost done, but we need to make sure the CUDA memory is freed
+  // properly. Usually, AVFrame data is freed when av_frame_free() is called
+  // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
+  // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
+  // with this is to associate the opaque_ref field of the AVFrame with a `free`
+  // callback that will then be called by av_frame_free().
   gpuFrame->opaque_ref = av_buffer_create(
-      nullptr, // data
+      nullptr, // data - we don't need any
       0, // data size
-      cudaBufferFreeCallback,  // callback triggered by av_frame_free()
+      cudaBufferFreeCallback, // callback triggered by av_frame_free()
       cudaBuffer, // parameter to callback
       0); // flags
   TORCH_CHECK(
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -81,7 +81,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       unsigned int pitch,
       const CUVIDPARSERDISPINFO& dispInfo);
 
-  // Convert CPU frame to GPU NV12 frame for GPU color conversion
   UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
 
   CUvideoparser videoParser_ = nullptr;
@@ -100,11 +99,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   // NPP context for color conversion
   UniqueNppContext nppCtx_;
 
-  // Swscale context caching for CPU->GPU NV12 conversion
+  std::unique_ptr<DeviceInterface> cpuFallback_;
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;
-
-  std::unique_ptr<DeviceInterface> cpuFallback_;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -104,34 +104,6 @@ using UniqueAVBufferSrcParameters = std::unique_ptr<
     AVBufferSrcParameters,
     Deleterv<AVBufferSrcParameters, void, av_freep>>;
 
-// Common swscale context management for efficient reuse across device
-// interfaces
-struct SwsFrameContext {
-  int inputWidth = 0;
-  int inputHeight = 0;
-  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
-  int outputWidth = 0;
-  int outputHeight = 0;
-
-  SwsFrameContext() = default;
-  SwsFrameContext(
-      int inputWidth,
-      int inputHeight,
-      AVPixelFormat inputFormat,
-      int outputWidth,
-      int outputHeight);
-
-  bool operator==(const SwsFrameContext& other) const;
-  bool operator!=(const SwsFrameContext& other) const;
-};
-
-// Utility functions for swscale context management
-UniqueSwsContext createSwsContext(
-    const SwsFrameContext& swsFrameContext,
-    AVColorSpace colorspace,
-    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
-    int swsFlags = SWS_BILINEAR);
-
 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so:
 //
@@ -279,4 +251,30 @@ AVFilterContext* createBuffersinkFilter(
     AVFilterGraph* filterGraph,
     enum AVPixelFormat outputFormat);
 
+struct SwsFrameContext {
+  int inputWidth = 0;
+  int inputHeight = 0;
+  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
+  int outputWidth = 0;
+  int outputHeight = 0;
+
+  SwsFrameContext() = default;
+  SwsFrameContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      int outputWidth,
+      int outputHeight);
+
+  bool operator==(const SwsFrameContext& other) const;
+  bool operator!=(const SwsFrameContext& other) const;
+};
+
+// Utility functions for swscale context management
+UniqueSwsContext createSwsContext(
+    const SwsFrameContext& swsFrameContext,
+    AVColorSpace colorspace,
+    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
+    int swsFlags = SWS_BILINEAR);
+
 } // namespace facebook::torchcodec