Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,16 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
}

// Encoding entry point required by the DeviceInterface contract. The beta
// CUDA interface is decode-only for now, so this always throws; it never
// returns a frame.
UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame(
    [[maybe_unused]] const torch::Tensor& tensor,
    [[maybe_unused]] AVPixelFormat targetFormat,
    [[maybe_unused]] int frameIndex,
    [[maybe_unused]] AVCodecContext* codecContext) {
  TORCH_CHECK(
      false,
      "Beta CUDA device interface does not support video encoding currently.");
}

std::string BetaCudaDeviceInterface::getDetails() {
std::string details = "Beta CUDA Device Interface.";
if (cpuFallback_) {
Expand Down
6 changes: 6 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ class BetaCudaDeviceInterface : public DeviceInterface {
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) override;

UniqueAVFrame convertTensorToAVFrame(
const torch::Tensor& tensor,
AVPixelFormat targetFormat,
int frameIndex,
AVCodecContext* codecContext) override;

int sendPacket(ReferenceAVPacket& packet) override;
int sendEOFPacket() override;
int receiveFrame(UniqueAVFrame& avFrame) override;
Expand Down
77 changes: 77 additions & 0 deletions src/torchcodec/_core/CUDACommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,21 @@ const Npp32f bt709FullRangeColorTwist[3][4] = {
{1.0f, -0.187324273f, -0.468124273f, -128.0f},
{1.0f, 1.8556f, 0.0f, -128.0f}};

// RGB to NV12 color conversion matrices (inverse of YUV to RGB).
// Each row is {rCoeff, gCoeff, bCoeff, offset} applied to an RGB pixel to
// produce one output component (Y, U, V respectively).
// Note: NPP's ColorTwist function apparently expects "limited range"
// coefficient format even when producing full range output. All matrices below
// use the limited range coefficient format (Y with +16 offset) for NPP
// compatibility.

// BT.601 limited range (matches FFmpeg default behavior when no colorspace is
// specified by the caller).
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
    {0.257f, 0.504f, 0.098f, 16.0f},
    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
    {-0.148f, -0.291f, 0.439f, 128.0f},
    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
    {0.439f, -0.368f, -0.071f, 128.0f}};

torch::Tensor convertNV12FrameToRGB(
UniqueAVFrame& avFrame,
const torch::Device& device,
Expand Down Expand Up @@ -246,6 +261,68 @@ torch::Tensor convertNV12FrameToRGB(
return dst;
}

// Converts a CHW uint8 RGB CUDA tensor into the pre-allocated NV12 AVFrame
// using NPP's color-twist conversion. Always uses BT.601 limited-range
// coefficients, matching FFmpeg's default behavior.
//
// `inputStream` is the stream that produced `rgbTensor`; a CUDA event makes
// the NPP stream wait on it before reading the tensor, so callers need not
// synchronize beforehand.
void convertRGBTensorToNV12Frame(
    const torch::Tensor& rgbTensor,
    UniqueAVFrame& nv12Frame,
    const torch::Device& device,
    const UniqueNppContext& nppCtx,
    at::cuda::CUDAStream inputStream) {
  TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device");
  TORCH_CHECK(
      rgbTensor.dim() == 3 && rgbTensor.size(0) == 3,
      "Expected 3D RGB tensor in CHW format, got shape: ",
      rgbTensor.sizes());
  // The NPP 8u entry point below reinterprets the raw bytes as Npp8u, so any
  // other dtype would silently produce garbage output.
  TORCH_CHECK(
      rgbTensor.scalar_type() == torch::kUInt8,
      "Expected uint8 RGB tensor, got dtype: ",
      rgbTensor.scalar_type());
  TORCH_CHECK(
      nv12Frame != nullptr && nv12Frame->data[0] != nullptr,
      "nv12Frame must be pre-allocated with CUDA memory");

  // Convert CHW to HWC for NPP processing. contiguous() materializes the
  // permuted view so row stride arithmetic below is valid.
  int height = static_cast<int>(rgbTensor.size(1));
  int width = static_cast<int>(rgbTensor.size(2));
  torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous();

  // Set up stream synchronization - make NPP stream wait for input tensor
  // operations.
  at::cuda::CUDAStream nppStream =
      at::cuda::getCurrentCUDAStream(device.index());
  at::cuda::CUDAEvent inputDoneEvent;
  inputDoneEvent.record(inputStream);
  inputDoneEvent.block(nppStream);

  // Point the (cached) NPP context at the current stream before launching.
  nppCtx->hStream = nppStream.stream();
  cudaError_t cudaErr =
      cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
  TORCH_CHECK(
      cudaErr == cudaSuccess,
      "cudaStreamGetFlags failed: ",
      cudaGetErrorString(cudaErr));

  // Always use FFmpeg's default behavior: BT.601 limited range.
  NppiSize oSizeROI = {width, height};

  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
      // Row stride in bytes: HWC stride(0) counts elements, element_size is 1
      // for uint8 but kept for clarity.
      hwcFrame.stride(0) * hwcFrame.element_size(),
      nv12Frame->data,
      nv12Frame->linesize,
      oSizeROI,
      defaultLimitedRangeRgbToNv12,
      *nppCtx);

  TORCH_CHECK(
      status == NPP_SUCCESS,
      "Failed to convert RGB to NV12: NPP error code ",
      status);

  // Validate CUDA operations launched successfully (async errors surface on a
  // later sync; this catches launch-time failures).
  cudaError_t memCheck = cudaGetLastError();
  TORCH_CHECK(
      memCheck == cudaSuccess,
      "CUDA error detected: ",
      cudaGetErrorString(memCheck));
}

UniqueNppContext getNppStreamContext(const torch::Device& device) {
int deviceIndex = getDeviceIndex(device);

Expand Down
7 changes: 7 additions & 0 deletions src/torchcodec/_core/CUDACommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ torch::Tensor convertNV12FrameToRGB(
at::cuda::CUDAStream nvdecStream,
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);

void convertRGBTensorToNV12Frame(
const torch::Tensor& rgbTensor,
UniqueAVFrame& nv12Frame,
const torch::Device& device,
const UniqueNppContext& nppCtx,
at::cuda::CUDAStream inputStream);

UniqueNppContext getNppStreamContext(const torch::Device& device);
void returnNppStreamContextToCache(
const torch::Device& device,
Expand Down
78 changes: 78 additions & 0 deletions src/torchcodec/_core/CpuDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,84 @@ std::optional<torch::Tensor> CpuDeviceInterface::maybeFlushAudioBuffers() {
/*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
}

// Converts a CHW uint8 RGB tensor into an AVFrame of `outPixelFormat` using
// swscale. The tensor's three planes are handed to sws_scale directly as a
// GBRP frame (no copy), so shape, dtype and memory layout must match exactly.
//
// Returns a newly-allocated frame whose pts is set to `frameIndex`.
UniqueAVFrame CpuDeviceInterface::convertTensorToAVFrame(
    const torch::Tensor& frame,
    AVPixelFormat outPixelFormat,
    int frameIndex,
    [[maybe_unused]] AVCodecContext* codecContext) {
  // Validate up front: the plane-pointer arithmetic below reads raw tensor
  // memory as three contiguous uint8 planes of size height*width.
  TORCH_CHECK(
      frame.dim() == 3 && frame.size(0) == 3,
      "Expected 3D RGB tensor in CHW format, got shape: ",
      frame.sizes());
  TORCH_CHECK(
      frame.scalar_type() == torch::kUInt8,
      "Expected uint8 tensor, got dtype: ",
      frame.scalar_type());
  TORCH_CHECK(
      frame.is_contiguous(),
      "Expected contiguous tensor for plane-pointer arithmetic");

  int inHeight = static_cast<int>(frame.sizes()[1]);
  int inWidth = static_cast<int>(frame.sizes()[2]);

  // For now, reuse input dimensions as output dimensions.
  int outWidth = inWidth;
  int outHeight = inHeight;

  // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering).
  AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP;

  // Initialize and cache scaling context if it does not exist.
  // NOTE(review): the cached context bakes in the first call's dimensions and
  // output format; assumes all frames in one encode session share them —
  // TODO-VideoEncoder: confirm, or key the cache on (dims, format).
  if (!swsContext_) {
    swsContext_.reset(sws_getContext(
        inWidth,
        inHeight,
        inPixelFormat,
        outWidth,
        outHeight,
        outPixelFormat,
        SWS_BICUBIC, // Used by FFmpeg CLI
        nullptr,
        nullptr,
        nullptr));
    TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context");
  }

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

  // Set output frame properties.
  avFrame->format = outPixelFormat;
  avFrame->width = outWidth;
  avFrame->height = outHeight;
  avFrame->pts = frameIndex;

  int status = av_frame_get_buffer(avFrame.get(), 0);
  TORCH_CHECK(status >= 0, "Failed to allocate frame buffer");

  // Wrap the tensor's memory in a temporary input frame. This frame does NOT
  // own its data pointers; the tensor must stay alive for the sws_scale call.
  UniqueAVFrame inputFrame(av_frame_alloc());
  TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame");

  inputFrame->format = inPixelFormat;
  inputFrame->width = inWidth;
  inputFrame->height = inHeight;

  uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());

  // TODO-VideoEncoder: Reorder tensor if in NHWC format
  int channelSize = inHeight * inWidth;
  // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format: plane 0 is G, plane 1 is
  // B, plane 2 is R.
  // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
  inputFrame->data[0] = tensorData + channelSize;
  inputFrame->data[1] = tensorData + (2 * channelSize);
  inputFrame->data[2] = tensorData;

  // Contiguous uint8 planes: one byte per pixel per plane.
  inputFrame->linesize[0] = inWidth;
  inputFrame->linesize[1] = inWidth;
  inputFrame->linesize[2] = inWidth;

  status = sws_scale(
      swsContext_.get(),
      inputFrame->data,
      inputFrame->linesize,
      0,
      inputFrame->height,
      avFrame->data,
      avFrame->linesize);
  // sws_scale returns the number of output rows written.
  TORCH_CHECK(status == outHeight, "sws_scale failed");
  return avFrame;
}

std::string CpuDeviceInterface::getDetails() {
return std::string("CPU Device Interface.");
}
Expand Down
11 changes: 6 additions & 5 deletions src/torchcodec/_core/CpuDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@ class CpuDeviceInterface : public DeviceInterface {

virtual ~CpuDeviceInterface() {}

std::optional<const AVCodec*> findCodec(
[[maybe_unused]] const AVCodecID& codecId) override {
return std::nullopt;
}

virtual void initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx,
Expand All @@ -43,6 +38,12 @@ class CpuDeviceInterface : public DeviceInterface {
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) override;

UniqueAVFrame convertTensorToAVFrame(
const torch::Tensor& tensor,
AVPixelFormat targetFormat,
int frameIndex,
AVCodecContext* codecContext) override;

std::string getDetails() override;

private:
Expand Down
113 changes: 106 additions & 7 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <torch/types.h>
#include <mutex>

#include "CUDACommon.h"
#include "Cache.h"
#include "CudaDeviceInterface.h"
#include "FFMPEGCommon.h"
Expand Down Expand Up @@ -144,6 +146,40 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec(
codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
}

// Prepares the codec context for hardware encoding on this CUDA device: the
// encoder is fed AV_PIX_FMT_CUDA hardware frames whose software layout is
// NV12, allocated from a hardware frames context created here.
void CudaDeviceInterface::setupEncodingContext(AVCodecContext* codecContext) {
  TORCH_CHECK(
      hardwareDeviceCtx_, "Hardware device context has not been initialized");
  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
  // TODO-VideoEncoder: we currently always encode through NV12 rather than
  // preserving the caller's desired pixel format; revisit once the RGB->YUV
  // conversion supports more target formats.
  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
  codecContext->pix_fmt = AV_PIX_FMT_CUDA;

  AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
  TORCH_CHECK(
      hwFramesCtxRef != nullptr,
      "Failed to allocate hardware frames context for codec");

  AVHWFramesContext* hwFramesCtx =
      reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
  hwFramesCtx->format = codecContext->pix_fmt;
  hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
  hwFramesCtx->width = codecContext->width;
  hwFramesCtx->height = codecContext->height;

  int ret = av_hwframe_ctx_init(hwFramesCtxRef);
  if (ret < 0) {
    // On failure we still own the ref; release it before throwing.
    av_buffer_unref(&hwFramesCtxRef);
    TORCH_CHECK(
        false,
        "Failed to initialize CUDA frames context for codec: ",
        getFFMPEGErrorStringFromErrorCode(ret));
  }

  // Ownership of the reference transfers to the codec context, which unrefs
  // it when the context is freed.
  codecContext->hw_frames_ctx = hwFramesCtxRef;
}

UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
UniqueAVFrame& avFrame) {
// We need FFmpeg filters to handle those conversion cases which are not
Expand Down Expand Up @@ -329,11 +365,40 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
}

namespace {
// Helper function to check if a codec supports CUDA hardware acceleration
bool codecSupportsCudaHardware(const AVCodec* codec) {
const AVCodecHWConfig* config = nullptr;
for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; ++j) {
if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
return true;
}
}
return false;
}
} // namespace

// inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
// we have to do this because of an FFmpeg bug where hardware decoding is not
// appropriately set, so we just go off and find the matching codec for the CUDA
// device
std::optional<const AVCodec*> CudaDeviceInterface::findCodec(

std::optional<const AVCodec*> CudaDeviceInterface::findEncoder(
const AVCodecID& codecId) {
void* i = nullptr;
const AVCodec* codec = nullptr;
while ((codec = av_codec_iterate(&i)) != nullptr) {
if (codec->id != codecId || !av_codec_is_encoder(codec)) {
continue;
}
if (codecSupportsCudaHardware(codec)) {
return codec;
}
}
return std::nullopt;
}

std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
const AVCodecID& codecId) {
void* i = nullptr;
const AVCodec* codec = nullptr;
Expand All @@ -342,18 +407,52 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
continue;
}

const AVCodecHWConfig* config = nullptr;
for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
++j) {
if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
return codec;
}
if (codecSupportsCudaHardware(codec)) {
return codec;
}
}

return std::nullopt;
}

// Converts a CHW RGB CUDA tensor into a hardware (AV_PIX_FMT_CUDA / NV12)
// AVFrame suitable for NVENC. The frame buffer is allocated from the codec's
// hardware frames context, so setupEncodingContext() must have been called on
// `codecContext` first.
UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame(
    const torch::Tensor& frame,
    [[maybe_unused]] AVPixelFormat targetFormat,
    int frameIndex,
    AVCodecContext* codecContext) {
  TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors");
  TORCH_CHECK(
      frame.dim() == 3 && frame.size(0) == 3,
      "Expected 3D RGB tensor (CHW format), got shape: ",
      frame.sizes());
  // av_hwframe_get_buffer() dereferences the frames context, so fail with a
  // clear message instead of passing nullptr into FFmpeg (which would crash
  // or return an opaque error).
  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
  TORCH_CHECK(
      codecContext->hw_frames_ctx != nullptr,
      "codecContext->hw_frames_ctx is null; setupEncodingContext() must be "
      "called before converting tensors to hardware frames.");

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

  avFrame->format = AV_PIX_FMT_CUDA;
  avFrame->width = static_cast<int>(frame.size(2));
  avFrame->height = static_cast<int>(frame.size(1));
  avFrame->pts = frameIndex;

  int ret =
      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
  TORCH_CHECK(
      ret >= 0,
      "Failed to allocate hardware frame: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  // Synchronize the conversion with whatever stream produced the tensor.
  at::cuda::CUDAStream currentStream =
      at::cuda::getCurrentCUDAStream(device_.index());

  convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream);

  // Set color properties to FFmpeg defaults, matching the BT.601
  // limited-range matrix used by the NV12 conversion.
  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

  return avFrame;
}

std::string CudaDeviceInterface::getDetails() {
// Note: for this interface specifically the fallback is only known after a
// frame has been decoded, not before: that's when FFmpeg decides to fallback,
Expand Down
Loading
Loading