9 | 9 | #include <mutex> |
10 | 10 | #include <vector> |
11 | 11 |
12 | | -#include "src/torchcodec/_core/BetaCudaDeviceInterface.h" |
| 12 | +#include "BetaCudaDeviceInterface.h" |
13 | 13 |
14 | | -#include "src/torchcodec/_core/DeviceInterface.h" |
15 | | -#include "src/torchcodec/_core/FFMPEGCommon.h" |
16 | | -#include "src/torchcodec/_core/NVDECCache.h" |
| 14 | +#include "DeviceInterface.h" |
| 15 | +#include "FFMPEGCommon.h" |
| 16 | +#include "NVDECCache.h" |
17 | 17 |
18 | | -#include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h" |
19 | | -#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h" |
20 | | -#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h" |
| 18 | +#include "NVCUVIDRuntimeLoader.h" |
| 19 | +#include "nvcuvid_include/cuviddec.h" |
| 20 | +#include "nvcuvid_include/nvcuvid.h" |
21 | 21 |
22 | 22 | extern "C" { |
23 | 23 | #include <libavutil/hwcontext_cuda.h> |
@@ -213,6 +213,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { |
213 | 213 | return true; |
214 | 214 | } |
215 | 215 |
| 216 | +// Callback for freeing CUDA memory associated with an AVFrame; see where it's
| 217 | +// used for more details.
| 218 | +void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) { |
| 219 | + cudaFree(opaque); |
| 220 | +} |
| 221 | + |
216 | 222 | } // namespace |
217 | 223 |
218 | 224 | BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) |
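
The new `cudaBufferFreeCallback` above is one half of a standard FFmpeg idiom for handing foreign memory to an `AVFrame`: a zero-sized `AVBufferRef` is attached to `frame->opaque_ref` purely to carry a destructor, and `av_frame_free()` unrefs it, which fires the callback with the device pointer. Below is a minimal, self-contained sketch of that idiom; `makeFrameOwningCudaBuffer` is a hypothetical helper for illustration, not code from this PR.

```cpp
extern "C" {
#include <libavutil/buffer.h>
#include <libavutil/frame.h>
}
#include <cuda_runtime.h>

static void freeCudaOpaque(void* opaque, uint8_t* /*data*/) {
  // `opaque` is the device pointer we allocated ourselves with cudaMalloc.
  cudaFree(opaque);
}

AVFrame* makeFrameOwningCudaBuffer(size_t bytes) {
  uint8_t* devPtr = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&devPtr), bytes) != cudaSuccess) {
    return nullptr;
  }
  AVFrame* frame = av_frame_alloc();
  if (frame == nullptr) {
    cudaFree(devPtr);
    return nullptr;
  }
  frame->data[0] = devPtr;
  // Zero-sized buffer whose only job is to call freeCudaOpaque(devPtr) when
  // the frame's last reference is dropped by av_frame_free().
  frame->opaque_ref = av_buffer_create(nullptr, 0, freeCudaOpaque, devPtr, 0);
  if (frame->opaque_ref == nullptr) {
    av_frame_free(&frame);
    cudaFree(devPtr);
    return nullptr;
  }
  return frame;
}
```
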
@@ -668,38 +674,163 @@ void BetaCudaDeviceInterface::flush() { |
668 | 674 | std::swap(readyFrames_, emptyQueue); |
669 | 675 | } |
670 | 676 |
| 677 | +UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( |
| 678 | + UniqueAVFrame& cpuFrame) { |
| 679 | + // This is called in the context of the CPU fallback: the frame was decoded on |
| 680 | + // the CPU, and in this function we convert that frame into NV12 format and |
| 681 | + // send it to the GPU. |
| 682 | + // We do that in 2 steps: |
| 683 | + // - First we convert the input CPU frame into an intermediate NV12 CPU frame |
| 684 | + // using sws_scale. |
| 685 | + // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
| 686 | + // is the frame we return.
| 687 | + |
| 688 | + TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null"); |
| 689 | + |
| 690 | + int width = cpuFrame->width; |
| 691 | + int height = cpuFrame->height; |
| 692 | + |
| 693 | + // Intermediate NV12 CPU frame: it's not on the GPU yet.
| 694 | + UniqueAVFrame nv12CpuFrame(av_frame_alloc()); |
| 695 | + TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); |
| 696 | + |
| 697 | + nv12CpuFrame->format = AV_PIX_FMT_NV12; |
| 698 | + nv12CpuFrame->width = width; |
| 699 | + nv12CpuFrame->height = height; |
| 700 | + |
| 701 | + int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0); |
| 702 | + TORCH_CHECK( |
| 703 | + ret >= 0, |
| 704 | + "Failed to allocate NV12 CPU frame buffer: ", |
| 705 | + getFFMPEGErrorStringFromErrorCode(ret)); |
| 706 | + |
| 707 | + SwsFrameContext swsFrameContext( |
| 708 | + width, |
| 709 | + height, |
| 710 | + static_cast<AVPixelFormat>(cpuFrame->format), |
| 711 | + width, |
| 712 | + height); |
| 713 | + |
| 714 | + if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) { |
| 715 | + swsContext_ = createSwsContext( |
| 716 | + swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR); |
| 717 | + prevSwsFrameContext_ = swsFrameContext; |
| 718 | + } |
| 719 | + |
| 720 | + int convertedHeight = sws_scale( |
| 721 | + swsContext_.get(), |
| 722 | + cpuFrame->data, |
| 723 | + cpuFrame->linesize, |
| 724 | + 0, |
| 725 | + height, |
| 726 | + nv12CpuFrame->data, |
| 727 | + nv12CpuFrame->linesize); |
| 728 | + TORCH_CHECK( |
| 729 | + convertedHeight == height, "sws_scale failed for CPU->NV12 conversion"); |
| 730 | + |
| 731 | + int ySize = width * height; |
| 732 | + TORCH_CHECK( |
| 733 | + ySize % 2 == 0, |
| 734 | + "Y plane size must be even. Please report on TorchCodec repo."); |
| 735 | + int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane |
| 736 | + size_t totalSize = static_cast<size_t>(ySize + uvSize); |
| 737 | + |
| 738 | + uint8_t* cudaBuffer = nullptr; |
| 739 | + cudaError_t err = |
| 740 | + cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize); |
| 741 | + TORCH_CHECK( |
| 742 | + err == cudaSuccess, |
| 743 | + "Failed to allocate CUDA memory: ", |
| 744 | + cudaGetErrorString(err)); |
| 745 | + |
| 746 | + UniqueAVFrame gpuFrame(av_frame_alloc()); |
| 747 | + TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame"); |
| 748 | + |
| 749 | + gpuFrame->format = AV_PIX_FMT_CUDA; |
| 750 | + gpuFrame->width = width; |
| 751 | + gpuFrame->height = height; |
| 752 | + gpuFrame->data[0] = cudaBuffer; |
| 753 | + gpuFrame->data[1] = cudaBuffer + ySize; |
| 754 | + gpuFrame->linesize[0] = width; |
| 755 | + gpuFrame->linesize[1] = width; |
| 756 | + |
| 757 | + // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the |
| 758 | + // linesizes (strides) may differ from the widths for the input CPU
| 759 | + // frame. That's precisely what cudaMemcpy2D is for. |
| 760 | + err = cudaMemcpy2D( |
| 761 | + gpuFrame->data[0], |
| 762 | + gpuFrame->linesize[0], |
| 763 | + nv12CpuFrame->data[0], |
| 764 | + nv12CpuFrame->linesize[0], |
| 765 | + width, |
| 766 | + height, |
| 767 | + cudaMemcpyHostToDevice); |
| 768 | + TORCH_CHECK( |
| 769 | + err == cudaSuccess, |
| 770 | + "Failed to copy Y plane to GPU: ", |
| 771 | + cudaGetErrorString(err)); |
| 772 | + |
| 773 | + TORCH_CHECK( |
| 774 | + height % 2 == 0, |
| 775 | + "height must be even. Please report on TorchCodec repo."); |
| 776 | + err = cudaMemcpy2D( |
| 777 | + gpuFrame->data[1], |
| 778 | + gpuFrame->linesize[1], |
| 779 | + nv12CpuFrame->data[1], |
| 780 | + nv12CpuFrame->linesize[1], |
| 781 | + width, |
| 782 | + height / 2, |
| 783 | + cudaMemcpyHostToDevice); |
| 784 | + TORCH_CHECK( |
| 785 | + err == cudaSuccess, |
| 786 | + "Failed to copy UV plane to GPU: ", |
| 787 | + cudaGetErrorString(err)); |
| 788 | + |
| 789 | + ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get()); |
| 790 | + TORCH_CHECK( |
| 791 | + ret >= 0, |
| 792 | + "Failed to copy frame properties: ", |
| 793 | + getFFMPEGErrorStringFromErrorCode(ret)); |
| 794 | + |
| 795 | + // We're almost done, but we need to make sure the CUDA memory is freed |
| 796 | + // properly. Usually, AVFrame data is freed when av_frame_free() is called |
| 797 | + // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory |
| 798 | + // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal |
| 799 | + // with this is to associate the opaque_ref field of the AVFrame with a `free` |
| 800 | + // callback that will then be called by av_frame_free(). |
| 801 | + gpuFrame->opaque_ref = av_buffer_create( |
| 802 | + nullptr, // data - we don't need any |
| 803 | + 0, // data size |
| 804 | + cudaBufferFreeCallback, // callback triggered by av_frame_free() |
| 805 | + cudaBuffer, // parameter to callback |
| 806 | + 0); // flags |
| 807 | + TORCH_CHECK( |
| 808 | + gpuFrame->opaque_ref != nullptr, |
| 809 | + "Failed to create GPU memory cleanup reference"); |
| 810 | + |
| 811 | + return gpuFrame; |
| 812 | +} |
| 813 | + |
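
The `cudaMemcpy2D` calls above are worth a standalone illustration. The sketch below, separate from the PR, performs a pitched host-to-device copy of just the Y plane, and works out the NV12 size arithmetic for a 1920x1080 frame. The padded `hostPitch` value is an assumption chosen for illustration.

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int main() {
  // NV12 sizes for a 1920x1080 frame:
  //   Y plane:  1920 * 1080     = 2073600 bytes (one byte per pixel)
  //   UV plane: 1920 * 1080 / 2 = 1036800 bytes (interleaved U and V)
  //   total:                      3110400 bytes, i.e. width * height * 3 / 2
  const int width = 1920;
  const int height = 1080;

  const size_t hostPitch = 2048; // a decoder may pad rows, so pitch >= width
  std::vector<uint8_t> hostY(hostPitch * height, 0);

  uint8_t* deviceY = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&deviceY),
                 static_cast<size_t>(width) * height) != cudaSuccess) {
    return 1;
  }

  // cudaMemcpy2D walks `height` rows, copying `width` bytes from each row,
  // advancing the source by hostPitch and the destination by width. A plain
  // cudaMemcpy of width * height bytes would also copy the padding bytes and
  // shear the image whenever hostPitch != width.
  cudaError_t err = cudaMemcpy2D(
      deviceY, width, // dst, dst pitch (tightly packed on the device)
      hostY.data(), hostPitch, // src, src pitch (padded on the host)
      width, height, // logical copy extent: width bytes x height rows
      cudaMemcpyHostToDevice);

  cudaFree(deviceY);
  return err == cudaSuccess ? 0 : 1;
}
```
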
671 | 814 | void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( |
672 | 815 | UniqueAVFrame& avFrame, |
673 | 816 | FrameOutput& frameOutput, |
674 | 817 | std::optional<torch::Tensor> preAllocatedOutputTensor) { |
675 | | - if (cpuFallback_) { |
676 | | - // CPU decoded frame - need to do CPU color conversion then transfer to GPU |
677 | | - FrameOutput cpuFrameOutput; |
678 | | - cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); |
679 | | - |
680 | | - // Transfer CPU frame to GPU |
681 | | - if (preAllocatedOutputTensor.has_value()) { |
682 | | - preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); |
683 | | - frameOutput.data = preAllocatedOutputTensor.value(); |
684 | | - } else { |
685 | | - frameOutput.data = cpuFrameOutput.data.to(device_); |
686 | | - } |
687 | | - return; |
688 | | - } |
| 818 | + UniqueAVFrame gpuFrame = |
| 819 | + cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); |
689 | 820 |
690 | 821 | // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA |
691 | 822 | // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). |
692 | 823 | TORCH_CHECK( |
693 | | - avFrame->format == AV_PIX_FMT_CUDA, |
| 824 | + gpuFrame->format == AV_PIX_FMT_CUDA, |
694 | 825 | "Expected CUDA format frame from BETA CUDA interface"); |
695 | 826 |
696 | | - validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame); |
| 827 | + validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame); |
697 | 828 |
698 | 829 | at::cuda::CUDAStream nvdecStream = |
699 | 830 | at::cuda::getCurrentCUDAStream(device_.index()); |
700 | 831 |
701 | 832 | frameOutput.data = convertNV12FrameToRGB( |
702 | | - avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); |
| 833 | + gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); |
703 | 834 | } |
704 | 835 |
705 | 836 | std::string BetaCudaDeviceInterface::getDetails() { |
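
Finally, the first step of the fallback, the CPU conversion to NV12, can be reproduced with plain libswscale, without the project-internal `createSwsContext` and `SwsFrameContext` wrappers used above. The sketch below is a hedged approximation of that step: `convertToNV12` is a hypothetical helper, and unlike the PR it rebuilds the `SwsContext` on every call instead of caching it against the previous frame geometry.

```cpp
extern "C" {
#include <libavutil/frame.h>
#include <libswscale/swscale.h>
}

AVFrame* convertToNV12(const AVFrame* src) {
  AVFrame* dst = av_frame_alloc();
  if (dst == nullptr) {
    return nullptr;
  }
  dst->format = AV_PIX_FMT_NV12;
  dst->width = src->width;
  dst->height = src->height;
  if (av_frame_get_buffer(dst, 0) < 0) {
    av_frame_free(&dst);
    return nullptr;
  }

  // In real code this context should be cached and only rebuilt when the
  // input geometry or pixel format changes, as the PR does with
  // prevSwsFrameContext_.
  SwsContext* sws = sws_getContext(
      src->width, src->height, static_cast<AVPixelFormat>(src->format),
      dst->width, dst->height, AV_PIX_FMT_NV12,
      SWS_BILINEAR, nullptr, nullptr, nullptr);
  if (sws == nullptr) {
    av_frame_free(&dst);
    return nullptr;
  }

  const int converted = sws_scale(
      sws, src->data, src->linesize, 0, src->height, dst->data, dst->linesize);
  sws_freeContext(sws);
  if (converted != src->height) {
    av_frame_free(&dst);
    return nullptr;
  }
  return dst;
}
```
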