@@ -212,6 +212,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
212212 return true ;
213213}
214214
215+ // Callback for freeing the CUDA memory associated with an AVFrame (`opaque` is
216+ // passed to cudaFree(), `data` is unused); see where it's used for more details.
217+ void cudaBufferFreeCallback (void * opaque, [[maybe_unused]] uint8_t * data) {
218+ cudaFree (opaque);
219+ }
220+
215221} // namespace
216222
217223BetaCudaDeviceInterface::BetaCudaDeviceInterface (const torch::Device& device)
@@ -665,20 +671,23 @@ void BetaCudaDeviceInterface::flush() {
665671 std::swap (readyFrames_, emptyQueue);
666672}
667673
668- namespace {
669- // Cleanup callback for CUDA memory allocated for GPU frames
670- void cudaBufferFreeCallback (void * opaque, [[maybe_unused]] uint8_t * data) {
671- cudaFree (opaque);
672- }
673- } // namespace
674-
675674UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12 (
676675 UniqueAVFrame& cpuFrame) {
676+ // This is called in the context of the CPU fallback: the frame was decoded on
677+ // the CPU, and in this function we convert that frame into NV12 format and
678+ // send it to the GPU.
679+ // We do that in 2 steps:
680+ // - First we convert the input CPU frame into an intermediate NV12 CPU frame
681+ // using sws_scale.
682+ // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. The
683+ // resulting GPU frame is what we return.
684+
677685 TORCH_CHECK (cpuFrame != nullptr , " CPU frame cannot be null" );
678686
679687 int width = cpuFrame->width ;
680688 int height = cpuFrame->height ;
681689
690+ // Intermediate NV12 CPU frame. It's not on the GPU yet.
682691 UniqueAVFrame nv12CpuFrame (av_frame_alloc ());
683692 TORCH_CHECK (nv12CpuFrame != nullptr , " Failed to allocate NV12 CPU frame" );
684693
@@ -707,7 +716,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
707716
708717 int convertedHeight = sws_scale (
709718 swsContext_.get (),
710- const_cast < const uint8_t * const *>( cpuFrame->data ) ,
719+ cpuFrame->data ,
711720 cpuFrame->linesize ,
712721 0 ,
713722 height,
@@ -739,6 +748,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
739748 gpuFrame->linesize [0 ] = width;
740749 gpuFrame->linesize [1 ] = width;
741750
751+ // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
752+ // linesizes (strides) of the input CPU frame may differ from its widths.
753+ // That's precisely what cudaMemcpy2D is for.
742754 err = cudaMemcpy2D (
743755 gpuFrame->data [0 ],
744756 gpuFrame->linesize [0 ],
@@ -771,10 +783,16 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
771783 " Failed to copy frame properties: " ,
772784 getFFMPEGErrorStringFromErrorCode (ret));
773785
786+ // We're almost done, but we need to make sure the CUDA memory is freed
787+ // properly. Usually, AVFrame data is freed when av_frame_free() is called
788+ // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
789+ // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
790+ // with this is to associate the opaque_ref field of the AVFrame with a `free`
791+ // callback that will then be called by av_frame_free().
774792 gpuFrame->opaque_ref = av_buffer_create (
775- nullptr , // data
793+ nullptr , // data - we don't need any
776794 0 , // data size
777- cudaBufferFreeCallback, // callback triggered by av_frame_free()
795+ cudaBufferFreeCallback, // callback triggered by av_frame_free()
778796 cudaBuffer, // parameter to callback
779797 0 ); // flags
780798 TORCH_CHECK (
0 commit comments