Reduce the number of files affected; add GpuEncoder.cpp

Dan-Flores · Dan-Flores · commit 9c7bae7eff4a · 2025-11-26T05:18:12.000Z
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -833,16 +833,6 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
-UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame(
-    [[maybe_unused]] const torch::Tensor& tensor,
-    [[maybe_unused]] AVPixelFormat targetFormat,
-    [[maybe_unused]] int frameIndex,
-    [[maybe_unused]] AVCodecContext* codecContext) {
-  TORCH_CHECK(
-      false,
-      "Beta CUDA device interface does not support video encoding currently.");
-}
-
 std::string BetaCudaDeviceInterface::getDetails() {
   std::string details = "Beta CUDA Device Interface.";
   if (cpuFallback_) {
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -48,12 +48,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
-  UniqueAVFrame convertTensorToAVFrame(
-      const torch::Tensor& tensor,
-      AVPixelFormat targetFormat,
-      int frameIndex,
-      AVCodecContext* codecContext) override;
-
   int sendPacket(ReferenceAVPacket& packet) override;
   int sendEOFPacket() override;
   int receiveFrame(UniqueAVFrame& avFrame) override;
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
     endif()
 
     set(core_library_dependencies
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -429,84 +429,6 @@ std::optional<torch::Tensor> CpuDeviceInterface::maybeFlushAudioBuffers() {
       /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
 }
 
-UniqueAVFrame CpuDeviceInterface::convertTensorToAVFrame(
-    const torch::Tensor& frame,
-    AVPixelFormat outPixelFormat,
-    int frameIndex,
-    [[maybe_unused]] AVCodecContext* codecContext) {
-  int inHeight = static_cast<int>(frame.sizes()[1]);
-  int inWidth = static_cast<int>(frame.sizes()[2]);
-
-  // For now, reuse input dimensions as output dimensions
-  int outWidth = inWidth;
-  int outHeight = inHeight;
-
-  // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
-  AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP;
-
-  // Initialize and cache scaling context if it does not exist
-  if (!swsContext_) {
-    swsContext_.reset(sws_getContext(
-        inWidth,
-        inHeight,
-        inPixelFormat,
-        outWidth,
-        outHeight,
-        outPixelFormat,
-        SWS_BICUBIC, // Used by FFmpeg CLI
-        nullptr,
-        nullptr,
-        nullptr));
-    TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context");
-  }
-
-  UniqueAVFrame avFrame(av_frame_alloc());
-  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
-
-  // Set output frame properties
-  avFrame->format = outPixelFormat;
-  avFrame->width = outWidth;
-  avFrame->height = outHeight;
-  avFrame->pts = frameIndex;
-
-  int status = av_frame_get_buffer(avFrame.get(), 0);
-  TORCH_CHECK(status >= 0, "Failed to allocate frame buffer");
-
-  // Need to convert/scale the frame
-  // Create temporary frame with input format
-  UniqueAVFrame inputFrame(av_frame_alloc());
-  TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame");
-
-  inputFrame->format = inPixelFormat;
-  inputFrame->width = inWidth;
-  inputFrame->height = inHeight;
-
-  uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());
-
-  // TODO-VideoEncoder: Reorder tensor if in NHWC format
-  int channelSize = inHeight * inWidth;
-  // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
-  // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
-  inputFrame->data[0] = tensorData + channelSize;
-  inputFrame->data[1] = tensorData + (2 * channelSize);
-  inputFrame->data[2] = tensorData;
-
-  inputFrame->linesize[0] = inWidth;
-  inputFrame->linesize[1] = inWidth;
-  inputFrame->linesize[2] = inWidth;
-
-  status = sws_scale(
-      swsContext_.get(),
-      inputFrame->data,
-      inputFrame->linesize,
-      0,
-      inputFrame->height,
-      avFrame->data,
-      avFrame->linesize);
-  TORCH_CHECK(status == outHeight, "sws_scale failed");
-  return avFrame;
-}
-
 std::string CpuDeviceInterface::getDetails() {
   return std::string("CPU Device Interface.");
 }
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -38,12 +38,6 @@ class CpuDeviceInterface : public DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
-  UniqueAVFrame convertTensorToAVFrame(
-      const torch::Tensor& tensor,
-      AVPixelFormat targetFormat,
-      int frameIndex,
-      AVCodecContext* codecContext) override;
-
   std::string getDetails() override;
 
  private:
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -146,40 +146,6 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec(
   codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
 }
 
-void CudaDeviceInterface::setupEncodingContext(AVCodecContext* codecContext) {
-  TORCH_CHECK(
-      hardwareDeviceCtx_, "Hardware device context has not been initialized");
-  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
-  // is there any way to preserve actual desired format?
-  // codecContext->sw_pix_fmt = codecContext->pix_fmt;
-  // Should we always produce AV_PIX_FMT_NV12?
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
-  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
-
-  AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
-  TORCH_CHECK(
-      hwFramesCtxRef != nullptr,
-      "Failed to allocate hardware frames context for codec");
-
-  AVHWFramesContext* hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
-  hwFramesCtx->format = codecContext->pix_fmt;
-  hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
-  hwFramesCtx->width = codecContext->width;
-  hwFramesCtx->height = codecContext->height;
-
-  int ret = av_hwframe_ctx_init(hwFramesCtxRef);
-  if (ret < 0) {
-    av_buffer_unref(&hwFramesCtxRef);
-    TORCH_CHECK(
-        false,
-        "Failed to initialize CUDA frames context for codec: ",
-        getFFMPEGErrorStringFromErrorCode(ret));
-  }
-
-  codecContext->hw_frames_ctx = hwFramesCtxRef;
-}
-
 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
     UniqueAVFrame& avFrame) {
   // We need FFmpeg filters to handle those conversion cases which are not
@@ -365,39 +331,10 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
       avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
-namespace {
-// Helper function to check if a codec supports CUDA hardware acceleration
-bool codecSupportsCudaHardware(const AVCodec* codec) {
-  const AVCodecHWConfig* config = nullptr;
-  for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; ++j) {
-    if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
-      return true;
-    }
-  }
-  return false;
-}
-} // namespace
-
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
 // we have to do this because of an FFmpeg bug where hardware decoding is not
 // appropriately set, so we just go off and find the matching codec for the CUDA
 // device
-
-std::optional<const AVCodec*> CudaDeviceInterface::findEncoder(
-    const AVCodecID& codecId) {
-  void* i = nullptr;
-  const AVCodec* codec = nullptr;
-  while ((codec = av_codec_iterate(&i)) != nullptr) {
-    if (codec->id != codecId || !av_codec_is_encoder(codec)) {
-      continue;
-    }
-    if (codecSupportsCudaHardware(codec)) {
-      return codec;
-    }
-  }
-  return std::nullopt;
-}
-
 std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
     const AVCodecID& codecId) {
   void* i = nullptr;
@@ -407,52 +344,18 @@ std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
       continue;
     }
 
-    if (codecSupportsCudaHardware(codec)) {
-      return codec;
+    const AVCodecHWConfig* config = nullptr;
+    for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
+         ++j) {
+      if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
+        return codec;
+      }
     }
   }
 
   return std::nullopt;
 }
 
-UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame(
-    const torch::Tensor& frame,
-    [[maybe_unused]] AVPixelFormat targetFormat,
-    int frameIndex,
-    AVCodecContext* codecContext) {
-  TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors");
-  TORCH_CHECK(
-      frame.dim() == 3 && frame.size(0) == 3,
-      "Expected 3D RGB tensor (CHW format), got shape: ",
-      frame.sizes());
-
-  UniqueAVFrame avFrame(av_frame_alloc());
-  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
-
-  avFrame->format = AV_PIX_FMT_CUDA;
-  avFrame->width = static_cast<int>(frame.size(2));
-  avFrame->height = static_cast<int>(frame.size(1));
-  avFrame->pts = frameIndex;
-
-  int ret = av_hwframe_get_buffer(
-      codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0);
-  TORCH_CHECK(
-      ret >= 0,
-      "Failed to allocate hardware frame: ",
-      getFFMPEGErrorStringFromErrorCode(ret));
-
-  at::cuda::CUDAStream currentStream =
-      at::cuda::getCurrentCUDAStream(device_.index());
-
-  convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream);
-
-  // Set color properties to FFmpeg defaults
-  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
-  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
-
-  return avFrame;
-}
-
 std::string CudaDeviceInterface::getDetails() {
   // Note: for this interface specifically the fallback is only known after a
   // frame has been decoded, not before: that's when FFmpeg decides to fallback,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -18,7 +18,6 @@ class CudaDeviceInterface : public DeviceInterface {
 
   virtual ~CudaDeviceInterface();
 
-  std::optional<const AVCodec*> findEncoder(const AVCodecID& codecId) override;
   std::optional<const AVCodec*> findDecoder(const AVCodecID& codecId) override;
 
   void initialize(
@@ -35,19 +34,11 @@ class CudaDeviceInterface : public DeviceInterface {
 
   void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override;
 
-  void setupEncodingContext(AVCodecContext* codecContext) override;
-
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
-  UniqueAVFrame convertTensorToAVFrame(
-      const torch::Tensor& tensor,
-      AVPixelFormat targetFormat,
-      int frameIndex,
-      AVCodecContext* codecContext) override;
-
   std::string getDetails() override;
 
  private:
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -46,11 +46,6 @@ class DeviceInterface {
     return device_;
   };
 
-  virtual std::optional<const AVCodec*> findEncoder(
-      [[maybe_unused]] const AVCodecID& codecId) {
-    return std::nullopt;
-  };
-
   virtual std::optional<const AVCodec*> findDecoder(
       [[maybe_unused]] const AVCodecID& codecId) {
     return std::nullopt;
@@ -92,25 +87,11 @@ class DeviceInterface {
   virtual void registerHardwareDeviceWithCodec(
       [[maybe_unused]] AVCodecContext* codecContext) {}
 
-  // Setup device-specific encoding context (e.g., hardware frame contexts).
-  // Called after registerHardwareDeviceWithCodec for encoders.
-  // Default implementation does nothing (suitable for CPU and basic cases).
-  virtual void setupEncodingContext(
-      [[maybe_unused]] AVCodecContext* codecContext) {}
-
   virtual void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
-  // Convert tensor to AVFrame, implemented per device interface.
-  // This is similar to convertAVFrameToFrameOutput for encoding
-  virtual UniqueAVFrame convertTensorToAVFrame(
-      const torch::Tensor& tensor,
-      AVPixelFormat targetFormat,
-      int frameIndex,
-      AVCodecContext* codecContext) = 0;
-
   // ------------------------------------------
   // Extension points for custom decoding paths
   // ------------------------------------------
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h

Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ function(make_torchcodec_libraries`
`100`	`100`	`)`
`101`	`101`
`102`	`102`	`if(ENABLE_CUDA)`
`103`		`- list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)`
	`103`	`+ list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)`
`104`	`104`	`endif()`
`105`	`105`
`106`	`106`	`set(core_library_dependencies`