Move more encoding logic into GpuEncoder.cpp, reduce diff

Dan-Flores · Dan-Flores · commit 7e5e6d4432cb · 2025-11-26T23:44:12.000Z
diff --git a/src/torchcodec/_core/CUDACommon.cpp b/src/torchcodec/_core/CUDACommon.cpp
@@ -156,21 +156,6 @@ const Npp32f bt709FullRangeColorTwist[3][4] = {
     {1.0f, -0.187324273f, -0.468124273f, -128.0f},
     {1.0f, 1.8556f, 0.0f, -128.0f}};
 
-// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
-// Note: NPP's ColorTwist function apparently expects "limited range"
-// coefficient format even when producing full range output. All matrices below
-// use the limited range coefficient format (Y with +16 offset) for NPP
-// compatibility.
-
-// BT.601 limited range (matches FFmpeg default behavior)
-const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
-    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
-    {0.257f, 0.504f, 0.098f, 16.0f},
-    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
-    {-0.148f, -0.291f, 0.439f, 128.0f},
-    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
-    {0.439f, -0.368f, -0.071f, 128.0f}};
-
 torch::Tensor convertNV12FrameToRGB(
     UniqueAVFrame& avFrame,
     const torch::Device& device,
@@ -261,68 +246,6 @@ torch::Tensor convertNV12FrameToRGB(
   return dst;
 }
 
-void convertRGBTensorToNV12Frame(
-    const torch::Tensor& rgbTensor,
-    UniqueAVFrame& nv12Frame,
-    const torch::Device& device,
-    const UniqueNppContext& nppCtx,
-    at::cuda::CUDAStream inputStream) {
-  TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device");
-  TORCH_CHECK(
-      rgbTensor.dim() == 3 && rgbTensor.size(0) == 3,
-      "Expected 3D RGB tensor in CHW format, got shape: ",
-      rgbTensor.sizes());
-  TORCH_CHECK(
-      nv12Frame != nullptr && nv12Frame->data[0] != nullptr,
-      "nv12Frame must be pre-allocated with CUDA memory");
-
-  // Convert CHW to HWC for NPP processing
-  int height = static_cast<int>(rgbTensor.size(1));
-  int width = static_cast<int>(rgbTensor.size(2));
-  torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous();
-
-  // Set up stream synchronization - make NPP stream wait for input tensor
-  // operations
-  at::cuda::CUDAStream nppStream =
-      at::cuda::getCurrentCUDAStream(device.index());
-  at::cuda::CUDAEvent inputDoneEvent;
-  inputDoneEvent.record(inputStream);
-  inputDoneEvent.block(nppStream);
-
-  // Setup NPP context
-  nppCtx->hStream = nppStream.stream();
-  cudaError_t cudaErr =
-      cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
-  TORCH_CHECK(
-      cudaErr == cudaSuccess,
-      "cudaStreamGetFlags failed: ",
-      cudaGetErrorString(cudaErr));
-
-  // Always use FFmpeg's default behavior: BT.601 limited range
-  NppiSize oSizeROI = {width, height};
-
-  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
-      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
-      hwcFrame.stride(0) * hwcFrame.element_size(),
-      nv12Frame->data,
-      nv12Frame->linesize,
-      oSizeROI,
-      defaultLimitedRangeRgbToNv12,
-      *nppCtx);
-
-  TORCH_CHECK(
-      status == NPP_SUCCESS,
-      "Failed to convert RGB to NV12: NPP error code ",
-      status);
-
-  // Validate CUDA operations completed successfully
-  cudaError_t memCheck = cudaGetLastError();
-  TORCH_CHECK(
-      memCheck == cudaSuccess,
-      "CUDA error detected: ",
-      cudaGetErrorString(memCheck));
-}
-
 UniqueNppContext getNppStreamContext(const torch::Device& device) {
   int deviceIndex = getDeviceIndex(device);
 
diff --git a/src/torchcodec/_core/CUDACommon.h b/src/torchcodec/_core/CUDACommon.h
@@ -37,13 +37,6 @@ torch::Tensor convertNV12FrameToRGB(
     at::cuda::CUDAStream nvdecStream,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
-void convertRGBTensorToNV12Frame(
-    const torch::Tensor& rgbTensor,
-    UniqueAVFrame& nv12Frame,
-    const torch::Device& device,
-    const UniqueNppContext& nppCtx,
-    at::cuda::CUDAStream inputStream);
-
 UniqueNppContext getNppStreamContext(const torch::Device& device);
 void returnNppStreamContextToCache(
     const torch::Device& device,
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -18,6 +18,11 @@ class CpuDeviceInterface : public DeviceInterface {
 
   virtual ~CpuDeviceInterface() {}
 
+  std::optional<const AVCodec*> findCodec(
+      [[maybe_unused]] const AVCodecID& codecId) override {
+    return std::nullopt;
+  }
+
   virtual void initialize(
       const AVStream* avStream,
       const UniqueDecodingAVFormatContext& avFormatCtx,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -335,7 +335,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 // we have to do this because of an FFmpeg bug where hardware decoding is not
 // appropriately set, so we just go off and find the matching codec for the CUDA
 // device
-std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
+std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
     const AVCodecID& codecId) {
   void* i = nullptr;
   const AVCodec* codec = nullptr;
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -18,7 +18,7 @@ class CudaDeviceInterface : public DeviceInterface {
 
   virtual ~CudaDeviceInterface();
 
-  std::optional<const AVCodec*> findDecoder(const AVCodecID& codecId) override;
+  std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
 
   void initialize(
       const AVStream* avStream,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -46,7 +46,7 @@ class DeviceInterface {
     return device_;
   };
 
-  virtual std::optional<const AVCodec*> findDecoder(
+  virtual std::optional<const AVCodec*> findCodec(
       [[maybe_unused]] const AVCodecID& codecId) {
     return std::nullopt;
   };
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -894,8 +894,7 @@ void VideoEncoder::encode() {
       avFrame = gpuEncoder_->convertTensorToAVFrame(
           currFrame, outPixelFormat_, i, avCodecContext_.get());
     } else {
-      // Use direct CPU conversion for CPU devices
-      avFrame = convertCpuTensorToAVFrame(currFrame, i);
+      avFrame = convertTensorToAVFrame(currFrame, i);
     }
     encodeFrame(autoAVPacket, avFrame);
   }
@@ -909,24 +908,25 @@ void VideoEncoder::encode() {
       getFFMPEGErrorStringFromErrorCode(status));
 }
 
-UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
-    const torch::Tensor& tensor,
+UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
+    const torch::Tensor& frame,
     int frameIndex) {
-  TORCH_CHECK(tensor.is_cpu(), "CPU encoder requires CPU tensors");
+  TORCH_CHECK(frame.is_cpu(), "CPU encoder requires CPU tensors");
   TORCH_CHECK(
-      tensor.dim() == 3 && tensor.size(0) == 3,
+      frame.dim() == 3 && frame.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
-      tensor.sizes());
+      frame.sizes());
 
-  inHeight_ = static_cast<int>(tensor.sizes()[1]);
-  inWidth_ = static_cast<int>(tensor.sizes()[2]);
+  // TODO-VideoEncoder: confirm these are set in initializeEncoder, then delete
+  // inHeight_ = static_cast<int>(tensor.sizes()[1]);
+  // inWidth_ = static_cast<int>(tensor.sizes()[2]);
 
-  // For now, reuse input dimensions as output dimensions
-  outWidth_ = inWidth_;
-  outHeight_ = inHeight_;
+  // // For now, reuse input dimensions as output dimensions
+  // outWidth_ = inWidth_;
+  // outHeight_ = inHeight_;
 
-  // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
-  inPixelFormat_ = AV_PIX_FMT_GBRP;
+  // // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
+  // inPixelFormat_ = AV_PIX_FMT_GBRP;
 
   // Initialize and cache scaling context if it does not exist
   if (!swsContext_) {
@@ -965,15 +965,15 @@ UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
   inputFrame->width = inWidth_;
   inputFrame->height = inHeight_;
 
-  uint8_t* tensorData = static_cast<uint8_t*>(tensor.data_ptr());
+  uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());
 
   // TODO-VideoEncoder: Reorder tensor if in NHWC format
   int channelSize = inHeight_ * inWidth_;
   // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
   // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
-  inputFrame->data[0] = tensorData + channelSize; // G channel
-  inputFrame->data[1] = tensorData + (2 * channelSize); // B channel
-  inputFrame->data[2] = tensorData; // R channel
+  inputFrame->data[0] = tensorData + channelSize;
+  inputFrame->data[1] = tensorData + (2 * channelSize);
+  inputFrame->data[2] = tensorData;
 
   inputFrame->linesize[0] = inWidth_;
   inputFrame->linesize[1] = inWidth_;
@@ -988,7 +988,6 @@ UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
       avFrame->data,
       avFrame->linesize);
   TORCH_CHECK(status == outHeight_, "sws_scale failed");
-
   return avFrame;
 }
 
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -162,14 +162,12 @@ class VideoEncoder {
 
  private:
   void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
+  UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& frame,
+      int frameIndex);
   void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame);
   void flushBuffers();
 
-  // CPU tensor-to-frame conversion for CPU encoding
-  UniqueAVFrame convertCpuTensorToAVFrame(
-      const torch::Tensor& tensor,
-      int frameIndex);
-
   UniqueEncodingAVFormatContext avFormatContext_;
   UniqueAVCodecContext avCodecContext_;
   AVStream* avStream_ = nullptr;
@@ -187,7 +185,6 @@ class VideoEncoder {
   AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
 
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
-  std::unique_ptr<DeviceInterface> deviceInterface_;
   std::unique_ptr<GpuEncoder> gpuEncoder_;
 
   bool encodeWasCalled_ = false;
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -40,7 +40,7 @@ AVPacket* ReferenceAVPacket::operator->() {
 
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
-#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // FFmpeg < 5.0.3
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
   return const_cast<AVCodec*>(codec);
 #else
   return codec;
diff --git a/src/torchcodec/_core/GpuEncoder.cpp b/src/torchcodec/_core/GpuEncoder.cpp
@@ -69,6 +69,20 @@ UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
   return UniqueAVBufferRef(hardwareDeviceCtxRaw);
 }
 
+// RGB to NV12 color conversion matrix (inverse of YUV to RGB).
+// Note: NPP's ColorTwist function apparently expects "limited range"
+// coefficient format even when producing full range output, so the matrix
+// below uses the limited range coefficient format (Y with +16 offset) for
+// NPP compatibility.
+
+// BT.601 limited range (matches FFmpeg default behavior). Rows: Y, U, V.
+const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
+    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
+    {0.257f, 0.504f, 0.098f, 16.0f},
+    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
+    {-0.148f, -0.291f, 0.439f, 128.0f},
+    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
+    {0.439f, -0.368f, -0.071f, 128.0f}};
 } // anonymous namespace
 
 GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) {
@@ -155,14 +169,6 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
       tensor.dim() == 3 && tensor.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
       tensor.sizes());
-
-  return convertRGBTensorToNV12Frame(tensor, frameIndex, codecContext);
-}
-
-UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame(
-    const torch::Tensor& tensor,
-    int frameIndex,
-    AVCodecContext* codecContext) {
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
 
@@ -178,13 +184,55 @@ UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame(
       "Failed to allocate hardware frame: ",
       getFFMPEGErrorStringFromErrorCode(ret));
 
+  // Validate that avFrame was properly allocated with CUDA memory
+  TORCH_CHECK(
+      avFrame != nullptr && avFrame->data[0] != nullptr,
+      "avFrame must be pre-allocated with CUDA memory");
+
+  // Convert CHW to HWC for NPP processing
+  int height = static_cast<int>(tensor.size(1));
+  int width = static_cast<int>(tensor.size(2));
+  torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
+
+  // Get current CUDA stream for NPP operations
   at::cuda::CUDAStream currentStream =
       at::cuda::getCurrentCUDAStream(device_.index());
 
-  facebook::torchcodec::convertRGBTensorToNV12Frame(
-      tensor, avFrame, device_, nppCtx_, currentStream);
+  // Setup NPP context with current stream
+  nppCtx_->hStream = currentStream.stream();
+  cudaError_t cudaErr =
+      cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
+  TORCH_CHECK(
+      cudaErr == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(cudaErr));
+
+  // Always use FFmpeg's default behavior: BT.601 limited range
+  NppiSize oSizeROI = {width, height};
+
+  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+      hwcFrame.stride(0) * hwcFrame.element_size(),
+      avFrame->data,
+      avFrame->linesize,
+      oSizeROI,
+      defaultLimitedRangeRgbToNv12,
+      *nppCtx_);
+
+  TORCH_CHECK(
+      status == NPP_SUCCESS,
+      "Failed to convert RGB to NV12: NPP error code ",
+      status);
+
+  // Validate CUDA operations completed successfully
+  cudaError_t memCheck = cudaGetLastError();
+  TORCH_CHECK(
+      memCheck == cudaSuccess,
+      "CUDA error detected: ",
+      cudaGetErrorString(memCheck));
 
-  // Set color properties to FFmpeg defaults
+  // TODO-VideoEncoder: Enable configuration of color properties, similar to
+  // FFmpeg. For now, set color properties to FFmpeg defaults.
   avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
   avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
 
diff --git a/src/torchcodec/_core/GpuEncoder.h b/src/torchcodec/_core/GpuEncoder.h
@@ -47,12 +47,6 @@ class GpuEncoder {
   UniqueNppContext nppCtx_;
 
   void initializeHardwareContext();
-  void setupHardwareFrameContext(AVCodecContext* codecContext);
-
-  UniqueAVFrame convertRGBTensorToNV12Frame(
-      const torch::Tensor& tensor,
-      int frameIndex,
-      AVCodecContext* codecContext);
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -462,7 +462,7 @@ void SingleStreamDecoder::addStream(
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
     avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        deviceInterface_->findDecoder(streamInfo.stream->codecpar->codec_id)
+        deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
             .value_or(avCodec));
   }
 
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -18,7 +18,6 @@ class VideoEncoder:
         frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate.
         device (str or torch.device, optional): The device to use for encoding. Default: "cpu".
             If you pass a CUDA device, frames will be encoded on GPU.
-            Note: The "beta" CUDA backend is not supported for encoding.
     """
 
     def __init__(