-
Notifications
You must be signed in to change notification settings - Fork 75
CPU fallback: do color-conversion on GPU. #992
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bf3e29b
d86a19e
7f88e1b
5c61a96
c6bda33
f4c8f4e
340974a
3afc97f
89f1547
042a35e
61d4c7f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,6 +213,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { | |
| return true; | ||
| } | ||
|
|
||
| // Callback for freeing CUDA memory associated with AVFrame see where it's used | ||
| // for more details. | ||
| void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) { | ||
| cudaFree(opaque); | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) | ||
|
|
@@ -668,38 +674,163 @@ void BetaCudaDeviceInterface::flush() { | |
| std::swap(readyFrames_, emptyQueue); | ||
| } | ||
|
|
||
| UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( | ||
| UniqueAVFrame& cpuFrame) { | ||
| // This is called in the context of the CPU fallback: the frame was decoded on | ||
| // the CPU, and in this function we convert that frame into NV12 format and | ||
| // send it to the GPU. | ||
| // We do that in 2 steps: | ||
| // - First we convert the input CPU frame into an intermediate NV12 CPU frame | ||
| // using sws_scale. | ||
| // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This | ||
| // is what we return | ||
|
|
||
| TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null"); | ||
|
|
||
| int width = cpuFrame->width; | ||
| int height = cpuFrame->height; | ||
|
|
||
| // intermediate NV12 CPU frame. It's not on the GPU yet. | ||
| UniqueAVFrame nv12CpuFrame(av_frame_alloc()); | ||
| TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); | ||
|
|
||
| nv12CpuFrame->format = AV_PIX_FMT_NV12; | ||
| nv12CpuFrame->width = width; | ||
| nv12CpuFrame->height = height; | ||
|
|
||
| int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0); | ||
| TORCH_CHECK( | ||
| ret >= 0, | ||
| "Failed to allocate NV12 CPU frame buffer: ", | ||
| getFFMPEGErrorStringFromErrorCode(ret)); | ||
|
|
||
| SwsFrameContext swsFrameContext( | ||
| width, | ||
| height, | ||
| static_cast<AVPixelFormat>(cpuFrame->format), | ||
| width, | ||
| height); | ||
|
|
||
| if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) { | ||
| swsContext_ = createSwsContext( | ||
| swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR); | ||
| prevSwsFrameContext_ = swsFrameContext; | ||
| } | ||
|
|
||
| int convertedHeight = sws_scale( | ||
| swsContext_.get(), | ||
| cpuFrame->data, | ||
| cpuFrame->linesize, | ||
| 0, | ||
| height, | ||
| nv12CpuFrame->data, | ||
| nv12CpuFrame->linesize); | ||
| TORCH_CHECK( | ||
| convertedHeight == height, "sws_scale failed for CPU->NV12 conversion"); | ||
|
|
||
| int ySize = width * height; | ||
| TORCH_CHECK( | ||
| ySize % 2 == 0, | ||
| "Y plane size must be even. Please report on TorchCodec repo."); | ||
| int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Integer rounding is okay? This is implicitly a floor operation.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for catching this, I forgot to look into it. Will report back.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I decided to go the easy route and just |
||
| size_t totalSize = static_cast<size_t>(ySize + uvSize); | ||
|
|
||
| uint8_t* cudaBuffer = nullptr; | ||
| cudaError_t err = | ||
| cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize); | ||
| TORCH_CHECK( | ||
| err == cudaSuccess, | ||
| "Failed to allocate CUDA memory: ", | ||
| cudaGetErrorString(err)); | ||
|
|
||
| UniqueAVFrame gpuFrame(av_frame_alloc()); | ||
| TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame"); | ||
|
|
||
| gpuFrame->format = AV_PIX_FMT_CUDA; | ||
| gpuFrame->width = width; | ||
| gpuFrame->height = height; | ||
| gpuFrame->data[0] = cudaBuffer; | ||
| gpuFrame->data[1] = cudaBuffer + ySize; | ||
| gpuFrame->linesize[0] = width; | ||
| gpuFrame->linesize[1] = width; | ||
|
|
||
| // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the | ||
| // linesizes (strides) may be different than the widths for the input CPU | ||
| // frame. That's precisely what cudaMemcpy2D is for. | ||
| err = cudaMemcpy2D( | ||
| gpuFrame->data[0], | ||
| gpuFrame->linesize[0], | ||
| nv12CpuFrame->data[0], | ||
| nv12CpuFrame->linesize[0], | ||
| width, | ||
| height, | ||
| cudaMemcpyHostToDevice); | ||
| TORCH_CHECK( | ||
| err == cudaSuccess, | ||
| "Failed to copy Y plane to GPU: ", | ||
| cudaGetErrorString(err)); | ||
|
|
||
| TORCH_CHECK( | ||
| height % 2 == 0, | ||
| "height must be even. Please report on TorchCodec repo."); | ||
| err = cudaMemcpy2D( | ||
| gpuFrame->data[1], | ||
| gpuFrame->linesize[1], | ||
| nv12CpuFrame->data[1], | ||
| nv12CpuFrame->linesize[1], | ||
| width, | ||
| height / 2, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto for integer rounding - okay here? |
||
| cudaMemcpyHostToDevice); | ||
| TORCH_CHECK( | ||
| err == cudaSuccess, | ||
| "Failed to copy UV plane to GPU: ", | ||
| cudaGetErrorString(err)); | ||
|
|
||
| ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get()); | ||
| TORCH_CHECK( | ||
| ret >= 0, | ||
| "Failed to copy frame properties: ", | ||
| getFFMPEGErrorStringFromErrorCode(ret)); | ||
|
|
||
| // We're almost done, but we need to make sure the CUDA memory is freed | ||
| // properly. Usually, AVFrame data is freed when av_frame_free() is called | ||
| // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory | ||
| // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal | ||
| // with this is to associate the opaque_ref field of the AVFrame with a `free` | ||
| // callback that will then be called by av_frame_free(). | ||
| gpuFrame->opaque_ref = av_buffer_create( | ||
| nullptr, // data - we don't need any | ||
| 0, // data size | ||
| cudaBufferFreeCallback, // callback triggered by av_frame_free() | ||
| cudaBuffer, // parameter to callback | ||
| 0); // flags | ||
| TORCH_CHECK( | ||
| gpuFrame->opaque_ref != nullptr, | ||
| "Failed to create GPU memory cleanup reference"); | ||
|
|
||
| return gpuFrame; | ||
| } | ||
|
|
||
| void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( | ||
| UniqueAVFrame& avFrame, | ||
| FrameOutput& frameOutput, | ||
| std::optional<torch::Tensor> preAllocatedOutputTensor) { | ||
| if (cpuFallback_) { | ||
| // CPU decoded frame - need to do CPU color conversion then transfer to GPU | ||
| FrameOutput cpuFrameOutput; | ||
| cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); | ||
|
|
||
| // Transfer CPU frame to GPU | ||
| if (preAllocatedOutputTensor.has_value()) { | ||
| preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); | ||
| frameOutput.data = preAllocatedOutputTensor.value(); | ||
| } else { | ||
| frameOutput.data = cpuFrameOutput.data.to(device_); | ||
| } | ||
| return; | ||
| } | ||
| UniqueAVFrame gpuFrame = | ||
| cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); | ||
|
|
||
| // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA | ||
| // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). | ||
| TORCH_CHECK( | ||
| avFrame->format == AV_PIX_FMT_CUDA, | ||
| gpuFrame->format == AV_PIX_FMT_CUDA, | ||
| "Expected CUDA format frame from BETA CUDA interface"); | ||
|
|
||
| validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame); | ||
| validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame); | ||
|
|
||
| at::cuda::CUDAStream nvdecStream = | ||
| at::cuda::getCurrentCUDAStream(device_.index()); | ||
|
|
||
| frameOutput.data = convertNV12FrameToRGB( | ||
| avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); | ||
| gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); | ||
| } | ||
|
|
||
| std::string BetaCudaDeviceInterface::getDetails() { | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As can be seen above, we are now using |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it accurate to say that NV12 is very similar to
AV_PIX_FMT_YUV420P (it uses YUV, and has 4:2:0 chroma subsampling), but we use NV12 here because that is the format the NPP library requires? As explained in this comment. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that's exactly right. NV12 would contain the exact same values as
AV_PIX_FMT_YUV420P, just ordered a bit differently.