Remove GpuEncoder; use a minimal DeviceInterface implementation

Dan-Flores · Dan-Flores · commit 9b81ede18484 · 2025-12-03T21:32:26.000Z
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
     endif()
 
     set(core_library_dependencies
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -362,4 +362,158 @@ std::string CudaDeviceInterface::getDetails() {
       (usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
 }
 
+// Below are methods for video encoding:
+namespace {
+// RGB to NV12 color conversion matrix for BT.601 limited range.
+// NPP ColorTwist function used below expects the limited range
+// color conversion matrix, and this matches FFmpeg's default behavior.
+const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
+    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
+    {0.257f, 0.504f, 0.098f, 16.0f},
+    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
+    {-0.148f, -0.291f, 0.439f, 128.0f},
+    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
+    {0.439f, -0.368f, -0.071f, 128.0f}};
+} // namespace
+
+// Converts one RGB frame tensor into an NV12 AVFrame backed by CUDA memory.
+//
+// tensor: 3D CHW uint8 tensor residing on a CUDA device.
+// targetFormat: unused; the output is always NV12 wrapped in AV_PIX_FMT_CUDA.
+// frameIndex: written to the frame's pts.
+// codecContext: must carry the hw_frames_ctx created by
+//   setupHardwareFrameContext(); the frame buffer is allocated from it.
+//
+// Fails a TORCH_CHECK on invalid input, allocation failure or NPP failure.
+std::optional<UniqueAVFrame> CudaDeviceInterface::convertTensorToAVFrame(
+    const torch::Tensor& tensor,
+    [[maybe_unused]] AVPixelFormat targetFormat,
+    int frameIndex,
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(
+      tensor.dim() == 3 && tensor.size(0) == 3,
+      "Expected 3D RGB tensor (CHW format), got shape: ",
+      tensor.sizes());
+  // The raw data pointer is handed to NPP as 8-bit device memory below, so
+  // any other dtype or a host tensor would be silently misinterpreted.
+  TORCH_CHECK(
+      tensor.scalar_type() == torch::kUInt8,
+      "Expected uint8 tensor, got dtype: ",
+      tensor.scalar_type());
+  TORCH_CHECK(
+      tensor.is_cuda(), "Expected CUDA tensor, got device: ", tensor.device());
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  TORCH_CHECK(
+      codecContext->hw_frames_ctx != nullptr,
+      "Hardware frames context has not been set up on the codec context");
+
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
+  int height = static_cast<int>(tensor.size(1));
+  int width = static_cast<int>(tensor.size(2));
+
+  // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
+  avFrame->format = AV_PIX_FMT_CUDA;
+  avFrame->height = height;
+  avFrame->width = width;
+  avFrame->pts = frameIndex;
+
+  // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
+  // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
+  // efficiency
+  int ret =
+      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to allocate hardware frame: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
+
+  // avFrame itself was validated above; only the plane pointer is checked.
+  TORCH_CHECK(
+      avFrame->data[0] != nullptr,
+      "avFrame must be pre-allocated with CUDA memory");
+
+  torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
+
+  at::cuda::CUDAStream currentStream =
+      at::cuda::getCurrentCUDAStream(device_.index());
+
+  nppCtx_->hStream = currentStream.stream();
+  cudaError_t cudaErr =
+      cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
+  TORCH_CHECK(
+      cudaErr == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(cudaErr));
+
+  // Source row pitch in bytes; converted explicitly to the int NPP expects.
+  const int srcStepBytes =
+      static_cast<int>(hwcFrame.stride(0) * hwcFrame.element_size());
+  NppiSize oSizeROI = {width, height};
+  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+      srcStepBytes,
+      avFrame->data,
+      avFrame->linesize,
+      oSizeROI,
+      defaultLimitedRangeRgbToNv12,
+      *nppCtx_);
+
+  TORCH_CHECK(
+      status == NPP_SUCCESS,
+      "Failed to convert RGB to NV12: NPP error code ",
+      status);
+
+  // TODO-VideoEncoder: Enable configuration of color properties, similar to
+  // FFmpeg. Below are the default color properties used by FFmpeg.
+  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
+  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+
+  return avFrame;
+}
+
+// Creates and initializes a CUDA hardware frames context sized to
+// codecContext's width/height and attaches it to codecContext->hw_frames_ctx.
+// Also sets codecContext's pix_fmt to AV_PIX_FMT_CUDA and sw_pix_fmt to
+// AV_PIX_FMT_NV12. Fails a TORCH_CHECK if the device context is missing or
+// FFmpeg cannot allocate/initialize the frames context.
+void CudaDeviceInterface::setupHardwareFrameContext(
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  TORCH_CHECK(
+      hardwareDeviceCtx_, "Hardware device context has not been initialized");
+
+  AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
+  TORCH_CHECK(
+      hwFramesCtxRef != nullptr,
+      "Failed to allocate hardware frames context for codec");
+
+  // Always set pixel formats to options that support CUDA encoding.
+  // TODO-VideoEncoder: Enable user set pixel formats to be set and properly
+  // handled with NPP functions in convertTensorToAVFrame
+  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
+
+  AVHWFramesContext* hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
+  hwFramesCtx->format = codecContext->pix_fmt;
+  hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
+  hwFramesCtx->width = codecContext->width;
+  hwFramesCtx->height = codecContext->height;
+
+  int ret = av_hwframe_ctx_init(hwFramesCtxRef);
+  if (ret < 0) {
+    av_buffer_unref(&hwFramesCtxRef);
+    TORCH_CHECK(
+        false,
+        "Failed to initialize CUDA frames context for codec: ",
+        getFFMPEGErrorStringFromErrorCode(ret));
+  }
+
+  // Drop any frames context already attached so it is not leaked;
+  // av_buffer_unref is a no-op when the reference is null.
+  av_buffer_unref(&codecContext->hw_frames_ctx);
+  codecContext->hw_frames_ctx = hwFramesCtxRef;
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -41,6 +41,14 @@ class CudaDeviceInterface : public DeviceInterface {
 
   std::string getDetails() override;
 
+  std::optional<UniqueAVFrame> convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext) override;
+
+  void setupHardwareFrameContext(AVCodecContext* codecContext) override;
+
  private:
   // Our CUDA decoding code assumes NV12 format. In order to handle other
   // kinds of input, we need to convert them to NV12. Our current implementation
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -138,6 +138,19 @@ class DeviceInterface {
     return "";
   }
 
+  // Video encoding hook; this default returns std::nullopt (CUDA overrides it).
+  virtual std::optional<UniqueAVFrame> convertTensorToAVFrame(
+      [[maybe_unused]] const torch::Tensor& tensor,
+      [[maybe_unused]] AVPixelFormat targetFormat,
+      [[maybe_unused]] int frameIndex,
+      [[maybe_unused]] AVCodecContext* codecContext) {
+    return std::nullopt;
+  }
+
+  // Video encoding hook; this default is a no-op (CUDA overrides it).
+  virtual void setupHardwareFrameContext(
+      [[maybe_unused]] AVCodecContext* codecContext) {}
+
  protected:
   torch::Device device_;
   SharedAVCodecContext codecContext_;
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -5,6 +5,7 @@
 #include "torch/types.h"
 
 extern "C" {
+#include <libavutil/hwcontext.h>
 #include <libavutil/opt.h>
 #include <libavutil/pixdesc.h>
 }
@@ -724,8 +725,10 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
+  // Only create device interface when frames are on a CUDA device.
+  // Encoding on CPU is implemented in this file.
   if (frames_.device().is_cuda()) {
-    gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
+    deviceInterface_ = createDeviceInterface(frames_.device());
   }
   const AVCodec* avCodec = nullptr;
   // If codec arg is provided, find codec using logic similar to FFmpeg:
@@ -824,9 +827,9 @@ void VideoEncoder::initializeEncoder(
         0);
   }
 
-  if (gpuEncoder_) {
-    gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
-    gpuEncoder_->setupHardwareFrameContext(avCodecContext_.get());
+  if (frames_.device().is_cuda()) {
+    deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
+    deviceInterface_->setupHardwareFrameContext(avCodecContext_.get());
   }
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
@@ -870,9 +873,17 @@ void VideoEncoder::encode() {
   for (int i = 0; i < numFrames; ++i) {
     torch::Tensor currFrame = frames_[i];
     UniqueAVFrame avFrame;
-    if (gpuEncoder_) {
-      avFrame = gpuEncoder_->convertTensorToAVFrame(
+    if (deviceInterface_) {
+      // The base DeviceInterface returns std::nullopt; treat that as failure.
+      auto cudaFrame = deviceInterface_->convertTensorToAVFrame(
           currFrame, outPixelFormat_, i, avCodecContext_.get());
+      TORCH_CHECK(
+          cudaFrame.has_value(),
+          "convertTensorToAVFrame failed for frame ",
+          i,
+          " on device: ",
+          frames_.device());
+      avFrame = std::move(*cudaFrame);
     } else {
       avFrame = convertTensorToAVFrame(currFrame, i);
     }
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -5,7 +5,6 @@
 #include "AVIOContextHolder.h"
 #include "DeviceInterface.h"
 #include "FFMPEGCommon.h"
-#include "GpuEncoder.h"
 #include "StreamOptions.h"
 
 extern "C" {
@@ -185,7 +184,7 @@ class VideoEncoder {
   AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
 
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
-  std::unique_ptr<GpuEncoder> gpuEncoder_;
+  std::unique_ptr<DeviceInterface> deviceInterface_;
 
   bool encodeWasCalled_ = false;
   AVDictionary* avFormatOptions_ = nullptr;
diff --git a/src/torchcodec/_core/GpuEncoder.cpp b/src/torchcodec/_core/GpuEncoder.cpp
diff --git a/src/torchcodec/_core/GpuEncoder.h b/src/torchcodec/_core/GpuEncoder.h

Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ function(make_torchcodec_libraries`
`100`	`100`	`)`
`101`	`101`
`102`	`102`	`if(ENABLE_CUDA)`
`103`		`- list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)`
	`103`	`+ list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)`
`104`	`104`	`endif()`
`105`	`105`
`106`	`106`	`set(core_library_dependencies`