@@ -674,149 +674,13 @@ void BetaCudaDeviceInterface::flush() {
674674 std::swap (readyFrames_, emptyQueue);
675675}
676676
677- UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12 (
678- UniqueAVFrame& cpuFrame) {
679- // This is called in the context of the CPU fallback: the frame was decoded on
680- // the CPU, and in this function we convert that frame into NV12 format and
681- // send it to the GPU.
682- // We do that in 2 steps:
683- // - First we convert the input CPU frame into an intermediate NV12 CPU frame
684- // using sws_scale.
685- // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
686- // is what we return
687-
688- TORCH_CHECK (cpuFrame != nullptr , " CPU frame cannot be null" );
689-
690- int width = cpuFrame->width ;
691- int height = cpuFrame->height ;
692-
693- // intermediate NV12 CPU frame. It's not on the GPU yet.
694- UniqueAVFrame nv12CpuFrame (av_frame_alloc ());
695- TORCH_CHECK (nv12CpuFrame != nullptr , " Failed to allocate NV12 CPU frame" );
696-
697- nv12CpuFrame->format = AV_PIX_FMT_NV12;
698- nv12CpuFrame->width = width;
699- nv12CpuFrame->height = height;
700-
701- int ret = av_frame_get_buffer (nv12CpuFrame.get (), 0 );
702- TORCH_CHECK (
703- ret >= 0 ,
704- " Failed to allocate NV12 CPU frame buffer: " ,
705- getFFMPEGErrorStringFromErrorCode (ret));
706-
707- SwsFrameContext swsFrameContext (
708- width,
709- height,
710- static_cast <AVPixelFormat>(cpuFrame->format ),
711- width,
712- height);
713-
714- if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
715- swsContext_ = createSwsContext (
716- swsFrameContext, cpuFrame->colorspace , AV_PIX_FMT_NV12, SWS_BILINEAR);
717- prevSwsFrameContext_ = swsFrameContext;
718- }
719-
720- int convertedHeight = sws_scale (
721- swsContext_.get (),
722- cpuFrame->data ,
723- cpuFrame->linesize ,
724- 0 ,
725- height,
726- nv12CpuFrame->data ,
727- nv12CpuFrame->linesize );
728- TORCH_CHECK (
729- convertedHeight == height, " sws_scale failed for CPU->NV12 conversion" );
730-
731- int ySize = width * height;
732- TORCH_CHECK (
733- ySize % 2 == 0 ,
734- " Y plane size must be even. Please report on TorchCodec repo." );
735- int uvSize = ySize / 2 ; // NV12: UV plane is half the size of Y plane
736- size_t totalSize = static_cast <size_t >(ySize + uvSize);
737-
738- uint8_t * cudaBuffer = nullptr ;
739- cudaError_t err =
740- cudaMalloc (reinterpret_cast <void **>(&cudaBuffer), totalSize);
741- TORCH_CHECK (
742- err == cudaSuccess,
743- " Failed to allocate CUDA memory: " ,
744- cudaGetErrorString (err));
745-
746- UniqueAVFrame gpuFrame (av_frame_alloc ());
747- TORCH_CHECK (gpuFrame != nullptr , " Failed to allocate GPU AVFrame" );
748-
749- gpuFrame->format = AV_PIX_FMT_CUDA;
750- gpuFrame->width = width;
751- gpuFrame->height = height;
752- gpuFrame->data [0 ] = cudaBuffer;
753- gpuFrame->data [1 ] = cudaBuffer + ySize;
754- gpuFrame->linesize [0 ] = width;
755- gpuFrame->linesize [1 ] = width;
756-
757- // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
758- // linesizes (strides) may be different than the widths for the input CPU
759- // frame. That's precisely what cudaMemcpy2D is for.
760- err = cudaMemcpy2D (
761- gpuFrame->data [0 ],
762- gpuFrame->linesize [0 ],
763- nv12CpuFrame->data [0 ],
764- nv12CpuFrame->linesize [0 ],
765- width,
766- height,
767- cudaMemcpyHostToDevice);
768- TORCH_CHECK (
769- err == cudaSuccess,
770- " Failed to copy Y plane to GPU: " ,
771- cudaGetErrorString (err));
772-
773- TORCH_CHECK (
774- height % 2 == 0 ,
775- " height must be even. Please report on TorchCodec repo." );
776- err = cudaMemcpy2D (
777- gpuFrame->data [1 ],
778- gpuFrame->linesize [1 ],
779- nv12CpuFrame->data [1 ],
780- nv12CpuFrame->linesize [1 ],
781- width,
782- height / 2 ,
783- cudaMemcpyHostToDevice);
784- TORCH_CHECK (
785- err == cudaSuccess,
786- " Failed to copy UV plane to GPU: " ,
787- cudaGetErrorString (err));
788-
789- ret = av_frame_copy_props (gpuFrame.get (), cpuFrame.get ());
790- TORCH_CHECK (
791- ret >= 0 ,
792- " Failed to copy frame properties: " ,
793- getFFMPEGErrorStringFromErrorCode (ret));
794-
795- // We're almost done, but we need to make sure the CUDA memory is freed
796- // properly. Usually, AVFrame data is freed when av_frame_free() is called
797- // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
798- // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
799- // with this is to associate the opaque_ref field of the AVFrame with a `free`
800- // callback that will then be called by av_frame_free().
801- gpuFrame->opaque_ref = av_buffer_create (
802- nullptr , // data - we don't need any
803- 0 , // data size
804- cudaBufferFreeCallback, // callback triggered by av_frame_free()
805- cudaBuffer, // parameter to callback
806- 0 ); // flags
807- TORCH_CHECK (
808- gpuFrame->opaque_ref != nullptr ,
809- " Failed to create GPU memory cleanup reference" );
810-
811- return gpuFrame;
812- }
813-
814677void BetaCudaDeviceInterface::convertAVFrameToFrameOutput (
815678 UniqueAVFrame& avFrame,
816679 FrameOutput& frameOutput,
817680 std::optional<torch::Tensor> preAllocatedOutputTensor) {
818681 UniqueAVFrame gpuFrame =
819- cpuFallback_ ? transferCpuFrameToGpuNV12 (avFrame) : std::move (avFrame);
682+ cpuFallback_ ? transferCpuFrameToGpuNV12 (avFrame, swsCtx_, device_)
683+ : std::move (avFrame);
820684
821685 // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
822686 // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
0 commit comments