Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 148 additions & 17 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
return true;
}

// Callback for freeing CUDA memory associated with AVFrame see where it's used
// for more details.
void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
cudaFree(opaque);
}

} // namespace

BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
Expand Down Expand Up @@ -668,38 +674,163 @@ void BetaCudaDeviceInterface::flush() {
std::swap(readyFrames_, emptyQueue);
}

UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
    UniqueAVFrame& cpuFrame) {
  // This is called in the context of the CPU fallback: the frame was decoded
  // on the CPU, and in this function we convert that frame into NV12 format
  // and send it to the GPU.
  // We do that in 2 steps:
  // - First we convert the input CPU frame into an intermediate NV12 CPU frame
  //   using sws_scale.
  // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
  //   is what we return.

  TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");

  int width = cpuFrame->width;
  int height = cpuFrame->height;

  // NV12 stores a full-resolution Y plane followed by an interleaved
  // half-resolution UV plane: each UV row is `width` bytes and there are
  // `height / 2` of them, so both dimensions must be even. Note that checking
  // ySize % 2 == 0 alone would not be enough: width=3, height=4 gives an even
  // ySize but is not a valid NV12 geometry.
  TORCH_CHECK(
      width % 2 == 0 && height % 2 == 0,
      "Frame dimensions must be even for NV12, got ",
      width,
      "x",
      height,
      ". Please report on TorchCodec repo.");

  // Intermediate NV12 CPU frame. It's not on the GPU yet.
  UniqueAVFrame nv12CpuFrame(av_frame_alloc());
  TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");

  nv12CpuFrame->format = AV_PIX_FMT_NV12;
  nv12CpuFrame->width = width;
  nv12CpuFrame->height = height;

  int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
  TORCH_CHECK(
      ret >= 0,
      "Failed to allocate NV12 CPU frame buffer: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  // Re-use the cached sws context when the conversion parameters haven't
  // changed since the previous frame; re-create it otherwise.
  SwsFrameContext swsFrameContext(
      width,
      height,
      static_cast<AVPixelFormat>(cpuFrame->format),
      width,
      height);

  if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
    swsContext_ = createSwsContext(
        swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
    prevSwsFrameContext_ = swsFrameContext;
  }

  int convertedHeight = sws_scale(
      swsContext_.get(),
      cpuFrame->data,
      cpuFrame->linesize,
      0,
      height,
      nv12CpuFrame->data,
      nv12CpuFrame->linesize);
  TORCH_CHECK(
      convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");

  int ySize = width * height;
  int uvSize = ySize / 2; // NV12: UV plane is half the size of the Y plane
  size_t totalSize = static_cast<size_t>(ySize + uvSize);

  uint8_t* cudaBuffer = nullptr;
  cudaError_t err =
      cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to allocate CUDA memory: ",
      cudaGetErrorString(err));

  // From here on, any early exit must not leak cudaBuffer. We first hand
  // ownership of the buffer to the AVFrame (via opaque_ref below), and only
  // then perform the operations that may throw.
  UniqueAVFrame gpuFrame(av_frame_alloc());
  if (gpuFrame == nullptr) {
    cudaFree(cudaBuffer);
    TORCH_CHECK(false, "Failed to allocate GPU AVFrame");
  }

  // We need to make sure the CUDA memory is freed properly. Usually, AVFrame
  // data is freed when av_frame_free() is called (upon UniqueAVFrame
  // destruction), but since we allocated the CUDA memory ourselves, FFmpeg
  // doesn't know how to free it. The recommended way to deal with this is to
  // associate the opaque_ref field of the AVFrame with a `free` callback that
  // will then be called by av_frame_free(). We attach the callback *before*
  // the copies below, so that a failing TORCH_CHECK on any of them frees the
  // buffer through gpuFrame's destruction instead of leaking it.
  gpuFrame->opaque_ref = av_buffer_create(
      nullptr, // data - we don't need any
      0, // data size
      cudaBufferFreeCallback, // callback triggered by av_frame_free()
      cudaBuffer, // parameter to callback
      0); // flags
  if (gpuFrame->opaque_ref == nullptr) {
    cudaFree(cudaBuffer);
    TORCH_CHECK(false, "Failed to create GPU memory cleanup reference");
  }

  gpuFrame->format = AV_PIX_FMT_CUDA;
  gpuFrame->width = width;
  gpuFrame->height = height;
  gpuFrame->data[0] = cudaBuffer;
  gpuFrame->data[1] = cudaBuffer + ySize;
  gpuFrame->linesize[0] = width;
  gpuFrame->linesize[1] = width;

  // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
  // linesizes (strides) may be different than the widths for the input CPU
  // frame. That's precisely what cudaMemcpy2D is for.
  err = cudaMemcpy2D(
      gpuFrame->data[0],
      gpuFrame->linesize[0],
      nv12CpuFrame->data[0],
      nv12CpuFrame->linesize[0],
      width,
      height,
      cudaMemcpyHostToDevice);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to copy Y plane to GPU: ",
      cudaGetErrorString(err));

  // UV plane: interleaved U and V, half the rows of the Y plane.
  err = cudaMemcpy2D(
      gpuFrame->data[1],
      gpuFrame->linesize[1],
      nv12CpuFrame->data[1],
      nv12CpuFrame->linesize[1],
      width,
      height / 2,
      cudaMemcpyHostToDevice);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to copy UV plane to GPU: ",
      cudaGetErrorString(err));

  // Carry over pts, duration, color properties, etc. from the decoded frame.
  ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
  TORCH_CHECK(
      ret >= 0,
      "Failed to copy frame properties: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  return gpuFrame;
}

void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
    UniqueAVFrame& avFrame,
    FrameOutput& frameOutput,
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  // When the CPU fallback decoded the frame, convert it to NV12 and upload it
  // to the GPU so that the NPP color-conversion path below is shared with
  // frames decoded natively by NVDEC. Note: the pre-change early-return CPU
  // path and its old `avFrame` uses were removed; `gpuFrame` is the single
  // source of truth from here on.
  UniqueAVFrame gpuFrame =
      cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);

  // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
  // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
  TORCH_CHECK(
      gpuFrame->format == AV_PIX_FMT_CUDA,
      "Expected CUDA format frame from BETA CUDA interface");

  validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);

  at::cuda::CUDAStream nvdecStream =
      at::cuda::getCurrentCUDAStream(device_.index());

  frameOutput.data = convertNV12FrameToRGB(
      gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
}

std::string BetaCudaDeviceInterface::getDetails() {
Expand Down
4 changes: 4 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
unsigned int pitch,
const CUVIDPARSERDISPINFO& dispInfo);

UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);

CUvideoparser videoParser_ = nullptr;
UniqueCUvideodecoder decoder_;
CUVIDEOFORMAT videoFormat_ = {};
Expand All @@ -99,6 +101,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

std::unique_ptr<DeviceInterface> cpuFallback_;
bool nvcuvidAvailable_ = false;
UniqueSwsContext swsContext_;
SwsFrameContext prevSwsFrameContext_;
};

} // namespace facebook::torchcodec
Expand Down
72 changes: 2 additions & 70 deletions src/torchcodec/_core/CpuDeviceInterface.cpp
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As can be seen above, we are now using swscale in BetaCudaDeviceInterface.cpp to do the YUV -> NV12 conversion. So I had to extract all the swscale logic out of CpuDeviceInterface.cpp and put it into FFmpegCommon.cpp so it can be reused across the two interfaces. Almost everything below this comment can be ignored and treated as copy/pasting code around. Just pay attention to the test at the bottom.

Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,6 @@ static bool g_cpu = registerDeviceInterface(

} // namespace

// Groups the parameters that fully describe a sws conversion (input/output
// geometry and input pixel format), so a cached SwsContext can be re-used
// when none of them changed between frames.
CpuDeviceInterface::SwsFrameContext::SwsFrameContext(
    int inputWidth,
    int inputHeight,
    AVPixelFormat inputFormat,
    int outputWidth,
    int outputHeight)
    : inputWidth(inputWidth),
      inputHeight(inputHeight),
      inputFormat(inputFormat),
      outputWidth(outputWidth),
      outputHeight(outputHeight) {}

// Two contexts are equal when every field describing the conversion agrees;
// any mismatch means the cached SwsContext cannot be re-used.
bool CpuDeviceInterface::SwsFrameContext::operator==(
    const CpuDeviceInterface::SwsFrameContext& other) const {
  if (inputWidth != other.inputWidth) {
    return false;
  }
  if (inputHeight != other.inputHeight) {
    return false;
  }
  if (inputFormat != other.inputFormat) {
    return false;
  }
  if (outputWidth != other.outputWidth) {
    return false;
  }
  return outputHeight == other.outputHeight;
}

// Defined as the negation of operator== above (pre-C++20 style, where != is
// not auto-generated from ==).
bool CpuDeviceInterface::SwsFrameContext::operator!=(
    const CpuDeviceInterface::SwsFrameContext& other) const {
  return !(*this == other);
}

CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
: DeviceInterface(device) {
TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!");
Expand Down Expand Up @@ -257,7 +233,8 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
outputDims.height);

if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
createSwsContext(swsFrameContext, avFrame->colorspace);
swsContext_ = createSwsContext(
swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
prevSwsFrameContext_ = swsFrameContext;
}

Expand All @@ -276,51 +253,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
return resultHeight;
}

// Creates a libswscale context that converts frames described by
// swsFrameContext to RGB24, configures its colorspace coefficients to match
// the frame's colorspace, and stores it in swsContext_.
void CpuDeviceInterface::createSwsContext(
    const SwsFrameContext& swsFrameContext,
    const enum AVColorSpace colorspace) {
  SwsContext* swsContext = sws_getContext(
      swsFrameContext.inputWidth,
      swsFrameContext.inputHeight,
      swsFrameContext.inputFormat,
      swsFrameContext.outputWidth,
      swsFrameContext.outputHeight,
      AV_PIX_FMT_RGB24,
      swsFlags_,
      nullptr,
      nullptr,
      nullptr);
  TORCH_CHECK(swsContext, "sws_getContext() returned nullptr");

  // Take ownership immediately: if any TORCH_CHECK below throws, the context
  // is freed by swsContext_'s deleter instead of leaking (previously the raw
  // pointer was only transferred at the very end of this function).
  swsContext_.reset(swsContext);

  int* invTable = nullptr;
  int* table = nullptr;
  int srcRange, dstRange, brightness, contrast, saturation;
  int ret = sws_getColorspaceDetails(
      swsContext,
      &invTable,
      &srcRange,
      &table,
      &dstRange,
      &brightness,
      &contrast,
      &saturation);
  TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1");

  // Override the default coefficient tables with the ones matching the
  // frame's colorspace, keeping the ranges and levels reported above.
  const int* colorspaceTable = sws_getCoefficients(colorspace);
  ret = sws_setColorspaceDetails(
      swsContext,
      colorspaceTable,
      srcRange,
      colorspaceTable,
      dstRange,
      brightness,
      contrast,
      saturation);
  TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1");
}

torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
const UniqueAVFrame& avFrame,
const FrameDims& outputDims) {
Expand Down
22 changes: 0 additions & 22 deletions src/torchcodec/_core/CpuDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,28 +54,6 @@ class CpuDeviceInterface : public DeviceInterface {
ColorConversionLibrary getColorConversionLibrary(
const FrameDims& inputFrameDims) const;

struct SwsFrameContext {
int inputWidth = 0;
int inputHeight = 0;
AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
int outputWidth = 0;
int outputHeight = 0;

SwsFrameContext() = default;
SwsFrameContext(
int inputWidth,
int inputHeight,
AVPixelFormat inputFormat,
int outputWidth,
int outputHeight);
bool operator==(const SwsFrameContext&) const;
bool operator!=(const SwsFrameContext&) const;
};

void createSwsContext(
const SwsFrameContext& swsFrameContext,
const enum AVColorSpace colorspace);

VideoStreamOptions videoStreamOptions_;
AVRational timeBase_;

Expand Down
Loading
Loading