Skip to content

Commit 9b81ede

Browse files
committed
remove GpuEncoder, use minimal deviceInterface implem
1 parent 6b8c1fe commit 9b81ede

File tree

8 files changed

+160
-279
lines changed

8 files changed

+160
-279
lines changed

src/torchcodec/_core/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
100100
)
101101

102102
if(ENABLE_CUDA)
103-
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
103+
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
104104
endif()
105105

106106
set(core_library_dependencies

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,4 +362,125 @@ std::string CudaDeviceInterface::getDetails() {
362362
(usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
363363
}
364364

365+
// Below are methods for video encoding:
366+
namespace {
367+
// RGB to NV12 color conversion matrix for BT.601 limited range.
368+
// NPP ColorTwist function used below expects the limited range
369+
// color conversion matrix, and this matches FFmpeg's default behavior.
370+
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
371+
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
372+
{0.257f, 0.504f, 0.098f, 16.0f},
373+
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
374+
{-0.148f, -0.291f, 0.439f, 128.0f},
375+
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
376+
{0.439f, -0.368f, -0.071f, 128.0f}};
377+
} // namespace
378+
379+
std::optional<UniqueAVFrame> CudaDeviceInterface::convertTensorToAVFrame(
380+
const torch::Tensor& tensor,
381+
[[maybe_unused]] AVPixelFormat targetFormat,
382+
int frameIndex,
383+
AVCodecContext* codecContext) {
384+
TORCH_CHECK(
385+
tensor.dim() == 3 && tensor.size(0) == 3,
386+
"Expected 3D RGB tensor (CHW format), got shape: ",
387+
tensor.sizes());
388+
389+
UniqueAVFrame avFrame(av_frame_alloc());
390+
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
391+
int height = static_cast<int>(tensor.size(1));
392+
int width = static_cast<int>(tensor.size(2));
393+
394+
// TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
395+
avFrame->format = AV_PIX_FMT_CUDA;
396+
avFrame->height = height;
397+
avFrame->width = width;
398+
avFrame->pts = frameIndex;
399+
400+
// FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
401+
// TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
402+
// efficiency
403+
int ret =
404+
av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
405+
TORCH_CHECK(
406+
ret >= 0,
407+
"Failed to allocate hardware frame: ",
408+
getFFMPEGErrorStringFromErrorCode(ret));
409+
410+
TORCH_CHECK(
411+
avFrame != nullptr && avFrame->data[0] != nullptr,
412+
"avFrame must be pre-allocated with CUDA memory");
413+
414+
torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
415+
416+
at::cuda::CUDAStream currentStream =
417+
at::cuda::getCurrentCUDAStream(device_.index());
418+
419+
nppCtx_->hStream = currentStream.stream();
420+
cudaError_t cudaErr =
421+
cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
422+
TORCH_CHECK(
423+
cudaErr == cudaSuccess,
424+
"cudaStreamGetFlags failed: ",
425+
cudaGetErrorString(cudaErr));
426+
427+
NppiSize oSizeROI = {width, height};
428+
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
429+
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
430+
hwcFrame.stride(0) * hwcFrame.element_size(),
431+
avFrame->data,
432+
avFrame->linesize,
433+
oSizeROI,
434+
defaultLimitedRangeRgbToNv12,
435+
*nppCtx_);
436+
437+
TORCH_CHECK(
438+
status == NPP_SUCCESS,
439+
"Failed to convert RGB to NV12: NPP error code ",
440+
status);
441+
442+
// TODO-VideoEncoder: Enable configuration of color properties, similar to
443+
// FFmpeg. Below are the default color properties used by FFmpeg.
444+
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
445+
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
446+
447+
return avFrame;
448+
}
449+
450+
void CudaDeviceInterface::setupHardwareFrameContext(
451+
AVCodecContext* codecContext) {
452+
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
453+
TORCH_CHECK(
454+
hardwareDeviceCtx_, "Hardware device context has not been initialized");
455+
456+
AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
457+
TORCH_CHECK(
458+
hwFramesCtxRef != nullptr,
459+
"Failed to allocate hardware frames context for codec");
460+
461+
// Always set pixel formats to options that support CUDA encoding.
462+
// TODO-VideoEncoder: Enable user set pixel formats to be set and properly
463+
// handled with NPP functions below
464+
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
465+
codecContext->pix_fmt = AV_PIX_FMT_CUDA;
466+
467+
AVHWFramesContext* hwFramesCtx =
468+
reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
469+
hwFramesCtx->format = codecContext->pix_fmt;
470+
hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
471+
hwFramesCtx->width = codecContext->width;
472+
hwFramesCtx->height = codecContext->height;
473+
474+
int ret = av_hwframe_ctx_init(hwFramesCtxRef);
475+
if (ret < 0) {
476+
av_buffer_unref(&hwFramesCtxRef);
477+
TORCH_CHECK(
478+
false,
479+
"Failed to initialize CUDA frames context for codec: ",
480+
getFFMPEGErrorStringFromErrorCode(ret));
481+
}
482+
483+
codecContext->hw_frames_ctx = hwFramesCtxRef;
484+
}
485+
365486
} // namespace facebook::torchcodec

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ class CudaDeviceInterface : public DeviceInterface {
4141

4242
std::string getDetails() override;
4343

44+
std::optional<UniqueAVFrame> convertTensorToAVFrame(
45+
const torch::Tensor& tensor,
46+
AVPixelFormat targetFormat,
47+
int frameIndex,
48+
AVCodecContext* codecContext) override;
49+
50+
void setupHardwareFrameContext(AVCodecContext* codecContext) override;
51+
4452
private:
4553
// Our CUDA decoding code assumes NV12 format. In order to handle other
4654
// kinds of input, we need to convert them to NV12. Our current implementation

src/torchcodec/_core/DeviceInterface.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,19 @@ class DeviceInterface {
138138
return "";
139139
}
140140

141+
// Function used for video encoding, only implemented in CudaDeviceInterface.
142+
virtual std::optional<UniqueAVFrame> convertTensorToAVFrame(
143+
[[maybe_unused]] const torch::Tensor& tensor,
144+
[[maybe_unused]] AVPixelFormat targetFormat,
145+
[[maybe_unused]] int frameIndex,
146+
[[maybe_unused]] AVCodecContext* codecContext) {
147+
return std::nullopt;
148+
}
149+
150+
// Function used for video encoding, only implemented in CudaDeviceInterface.
151+
virtual void setupHardwareFrameContext(
152+
[[maybe_unused]] AVCodecContext* codecContext) {}
153+
141154
protected:
142155
torch::Device device_;
143156
SharedAVCodecContext codecContext_;

src/torchcodec/_core/Encoder.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "torch/types.h"
66

77
extern "C" {
8+
#include <libavutil/hwcontext.h>
89
#include <libavutil/opt.h>
910
#include <libavutil/pixdesc.h>
1011
}
@@ -724,8 +725,10 @@ VideoEncoder::VideoEncoder(
724725

725726
void VideoEncoder::initializeEncoder(
726727
const VideoStreamOptions& videoStreamOptions) {
728+
// Only create device interface when frames are on a CUDA device.
729+
// Encoding on CPU is implemented in this file.
727730
if (frames_.device().is_cuda()) {
728-
gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
731+
deviceInterface_ = createDeviceInterface(frames_.device());
729732
}
730733
const AVCodec* avCodec = nullptr;
731734
// If codec arg is provided, find codec using logic similar to FFmpeg:
@@ -824,9 +827,9 @@ void VideoEncoder::initializeEncoder(
824827
0);
825828
}
826829

827-
if (gpuEncoder_) {
828-
gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
829-
gpuEncoder_->setupHardwareFrameContext(avCodecContext_.get());
830+
if (frames_.device().is_cuda()) {
831+
deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
832+
deviceInterface_->setupHardwareFrameContext(avCodecContext_.get());
830833
}
831834

832835
int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
@@ -870,9 +873,16 @@ void VideoEncoder::encode() {
870873
for (int i = 0; i < numFrames; ++i) {
871874
torch::Tensor currFrame = frames_[i];
872875
UniqueAVFrame avFrame;
873-
if (gpuEncoder_) {
874-
avFrame = gpuEncoder_->convertTensorToAVFrame(
876+
if (deviceInterface_) {
877+
auto cudaFrame = deviceInterface_->convertTensorToAVFrame(
875878
currFrame, outPixelFormat_, i, avCodecContext_.get());
879+
TORCH_CHECK(
880+
cudaFrame.has_value(),
881+
"convertTensorToAVFrame failed for frame ",
882+
i,
883+
"on device: ",
884+
frames_.device());
885+
avFrame = std::move(*cudaFrame);
876886
} else {
877887
avFrame = convertTensorToAVFrame(currFrame, i);
878888
}

src/torchcodec/_core/Encoder.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "AVIOContextHolder.h"
66
#include "DeviceInterface.h"
77
#include "FFMPEGCommon.h"
8-
#include "GpuEncoder.h"
98
#include "StreamOptions.h"
109

1110
extern "C" {
@@ -185,7 +184,7 @@ class VideoEncoder {
185184
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
186185

187186
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
188-
std::unique_ptr<GpuEncoder> gpuEncoder_;
187+
std::unique_ptr<DeviceInterface> deviceInterface_;
189188

190189
bool encodeWasCalled_ = false;
191190
AVDictionary* avFormatOptions_ = nullptr;

0 commit comments

Comments
 (0)