From bf3e29ba6859eae5e563da3062257e15f8b9ed4c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 17:43:59 +0100 Subject: [PATCH 01/10] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 112 ++++++++++++++++-- .../_core/BetaCudaDeviceInterface.h | 3 + 2 files changed, 104 insertions(+), 11 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 07ed92126..00f31816f 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -665,22 +665,112 @@ void BetaCudaDeviceInterface::flush() { std::swap(readyFrames_, emptyQueue); } +namespace { +// Cleanup callback for CUDA memory allocated for GPU frames +void cudaBufferFreeCallback(void* opaque, uint8_t* data) { + cudaFree(opaque); +} +} // namespace + +UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( + UniqueAVFrame& cpuFrame) { + TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null"); + + int width = cpuFrame->width; + int height = cpuFrame->height; + + // Step 1: Convert to NV12 on CPU using swscale + UniqueAVFrame nv12CpuFrame(av_frame_alloc()); + TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); + + nv12CpuFrame->format = AV_PIX_FMT_NV12; + nv12CpuFrame->width = width; + nv12CpuFrame->height = height; + + int ret = av_frame_get_buffer(nv12CpuFrame.get(), 32); + TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + UniqueSwsContext swsCtx(sws_getContext( + width, height, static_cast(cpuFrame->format), + width, height, AV_PIX_FMT_NV12, + SWS_BILINEAR, nullptr, nullptr, nullptr)); + TORCH_CHECK(swsCtx != nullptr, "Failed to create SwsContext for CPU->NV12 conversion"); + + int convertedHeight = sws_scale( + swsCtx.get(), + const_cast(cpuFrame->data), cpuFrame->linesize, + 0, height, + nv12CpuFrame->data, nv12CpuFrame->linesize); + TORCH_CHECK(convertedHeight == height, 
"sws_scale failed for CPU->NV12 conversion"); + + // Step 2: Allocate CUDA memory + int ySize = width * height; + int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane + size_t totalSize = ySize + uvSize; + + uint8_t* cudaBuffer = nullptr; + cudaError_t err = cudaMalloc(reinterpret_cast(&cudaBuffer), totalSize); + TORCH_CHECK(err == cudaSuccess, "Failed to allocate CUDA memory: ", cudaGetErrorString(err)); + + // Step 3: Create GPU frame + UniqueAVFrame gpuFrame(av_frame_alloc()); + TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame"); + + gpuFrame->format = AV_PIX_FMT_CUDA; + gpuFrame->width = width; + gpuFrame->height = height; + gpuFrame->data[0] = cudaBuffer; + gpuFrame->data[1] = cudaBuffer + ySize; + gpuFrame->linesize[0] = width; + gpuFrame->linesize[1] = width; + + // Step 4: Copy data from CPU NV12 to GPU using cudaMemcpy2D for safety + err = cudaMemcpy2D( + gpuFrame->data[0], gpuFrame->linesize[0], + nv12CpuFrame->data[0], nv12CpuFrame->linesize[0], + width, height, + cudaMemcpyHostToDevice); + TORCH_CHECK(err == cudaSuccess, "Failed to copy Y plane to GPU: ", cudaGetErrorString(err)); + + err = cudaMemcpy2D( + gpuFrame->data[1], gpuFrame->linesize[1], + nv12CpuFrame->data[1], nv12CpuFrame->linesize[1], + width, height / 2, + cudaMemcpyHostToDevice); + TORCH_CHECK(err == cudaSuccess, "Failed to copy UV plane to GPU: ", cudaGetErrorString(err)); + + // Step 5: Set up proper GPU memory cleanup using AVFrame's reference counting + ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get()); + TORCH_CHECK(ret >= 0, "Failed to copy frame properties: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + // Create a buffer reference that will automatically free CUDA memory when frame is destroyed + gpuFrame->opaque_ref = av_buffer_create( + reinterpret_cast(cudaBuffer), 0, // size=0 since we're not using the data pointer + cudaBufferFreeCallback, + cudaBuffer, // pass the actual CUDA buffer as opaque data + 0); + 
TORCH_CHECK(gpuFrame->opaque_ref != nullptr, "Failed to create GPU memory cleanup reference"); + + return gpuFrame; +} + void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { if (cpuFallback_) { - // CPU decoded frame - need to do CPU color conversion then transfer to GPU - FrameOutput cpuFrameOutput; - cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); - - // Transfer CPU frame to GPU - if (preAllocatedOutputTensor.has_value()) { - preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data); - frameOutput.data = preAllocatedOutputTensor.value(); - } else { - frameOutput.data = cpuFrameOutput.data.to(device_); - } + // CPU decoded frame - convert to GPU NV12 and do GPU color conversion + UniqueAVFrame gpuNV12Frame = transferCpuFrameToGpuNV12(avFrame); + + validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuNV12Frame); + + at::cuda::CUDAStream nvdecStream = + at::cuda::getCurrentCUDAStream(device_.index()); + + frameOutput.data = convertNV12FrameToRGB( + gpuNV12Frame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); return; } diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 3a9520867..a1f913e6c 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -81,6 +81,9 @@ class BetaCudaDeviceInterface : public DeviceInterface { unsigned int pitch, const CUVIDPARSERDISPINFO& dispInfo); + // Convert CPU frame to GPU NV12 frame for GPU color conversion + UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame); + CUvideoparser videoParser_ = nullptr; UniqueCUvideodecoder decoder_; CUVIDEOFORMAT videoFormat_ = {}; From d86a19ec9bac3338935a225c29db773de0aba28a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 18:16:43 +0100 Subject: [PATCH 02/10] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 00f31816f..cd3f21136 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -667,7 +667,7 @@ void BetaCudaDeviceInterface::flush() { namespace { // Cleanup callback for CUDA memory allocated for GPU frames -void cudaBufferFreeCallback(void* opaque, uint8_t* data) { +void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) { cudaFree(opaque); } } // namespace From 7f88e1bd7acfc50f48306b8ca8907917d6ec3cf6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 19:08:47 +0100 Subject: [PATCH 03/10] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index cd3f21136..76bd9b112 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -760,33 +760,22 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { - if (cpuFallback_) { - // CPU decoded frame - convert to GPU NV12 and do GPU color conversion - UniqueAVFrame gpuNV12Frame = transferCpuFrameToGpuNV12(avFrame); - - validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuNV12Frame); - - at::cuda::CUDAStream nvdecStream = - at::cuda::getCurrentCUDAStream(device_.index()); - - frameOutput.data = convertNV12FrameToRGB( - gpuNV12Frame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); - return; - } + // Convert CPU frame to GPU NV12 if using CPU fallback, otherwise use existing GPU frame + UniqueAVFrame gpuFrame = cpuFallback_ ? 
transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). TORCH_CHECK( - avFrame->format == AV_PIX_FMT_CUDA, + gpuFrame->format == AV_PIX_FMT_CUDA, "Expected CUDA format frame from BETA CUDA interface"); - validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame); + validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame); at::cuda::CUDAStream nvdecStream = at::cuda::getCurrentCUDAStream(device_.index()); frameOutput.data = convertNV12FrameToRGB( - avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); + gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); } std::string BetaCudaDeviceInterface::getDetails() { From 5c61a96c1c04e7dcc40f53da37548014da6ecdaf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 19:58:23 +0100 Subject: [PATCH 04/10] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 22 +++--- .../_core/BetaCudaDeviceInterface.h | 4 ++ src/torchcodec/_core/CpuDeviceInterface.cpp | 72 +------------------ src/torchcodec/_core/CpuDeviceInterface.h | 21 ------ src/torchcodec/_core/FFMPEGCommon.cpp | 69 ++++++++++++++++++ src/torchcodec/_core/FFMPEGCommon.h | 27 +++++++ test/test_decoders.py | 7 +- 7 files changed, 122 insertions(+), 100 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 76bd9b112..a08309752 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -215,7 +215,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { } // namespace BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) - : DeviceInterface(device) { + : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!"); 
TORCH_CHECK( device_.type() == torch::kCUDA, "Unsupported device: ", device_.str()); @@ -679,7 +679,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( int width = cpuFrame->width; int height = cpuFrame->height; - // Step 1: Convert to NV12 on CPU using swscale + // Step 1: Convert to NV12 on CPU using cached swscale context UniqueAVFrame nv12CpuFrame(av_frame_alloc()); TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); @@ -687,18 +687,23 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( nv12CpuFrame->width = width; nv12CpuFrame->height = height; - int ret = av_frame_get_buffer(nv12CpuFrame.get(), 32); + int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0); TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ", getFFMPEGErrorStringFromErrorCode(ret)); - UniqueSwsContext swsCtx(sws_getContext( + // Create or reuse swscale context using caching logic + SwsFrameContext swsFrameContext( width, height, static_cast(cpuFrame->format), - width, height, AV_PIX_FMT_NV12, - SWS_BILINEAR, nullptr, nullptr, nullptr)); - TORCH_CHECK(swsCtx != nullptr, "Failed to create SwsContext for CPU->NV12 conversion"); + width, height); + + if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) { + swsContext_ = createSwsContext( + swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR); + prevSwsFrameContext_ = swsFrameContext; + } int convertedHeight = sws_scale( - swsCtx.get(), + swsContext_.get(), const_cast(cpuFrame->data), cpuFrame->linesize, 0, height, nv12CpuFrame->data, nv12CpuFrame->linesize); @@ -760,7 +765,6 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { - // Convert CPU frame to GPU NV12 if using CPU fallback, otherwise use existing GPU frame UniqueAVFrame gpuFrame = cpuFallback_ ? 
transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index a1f913e6c..1fc97453c 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -100,6 +100,10 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; + // Swscale context caching for CPU->GPU NV12 conversion + UniqueSwsContext swsContext_; + SwsFrameContext prevSwsFrameContext_; + std::unique_ptr cpuFallback_; }; diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index 5aa20b09e..329f5af93 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -15,32 +15,9 @@ static bool g_cpu = registerDeviceInterface( } // namespace -CpuDeviceInterface::SwsFrameContext::SwsFrameContext( - int inputWidth, - int inputHeight, - AVPixelFormat inputFormat, - int outputWidth, - int outputHeight) - : inputWidth(inputWidth), - inputHeight(inputHeight), - inputFormat(inputFormat), - outputWidth(outputWidth), - outputHeight(outputHeight) {} - -bool CpuDeviceInterface::SwsFrameContext::operator==( - const CpuDeviceInterface::SwsFrameContext& other) const { - return inputWidth == other.inputWidth && inputHeight == other.inputHeight && - inputFormat == other.inputFormat && outputWidth == other.outputWidth && - outputHeight == other.outputHeight; -} - -bool CpuDeviceInterface::SwsFrameContext::operator!=( - const CpuDeviceInterface::SwsFrameContext& other) const { - return !(*this == other); -} CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device) - : DeviceInterface(device) { + : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { TORCH_CHECK(g_cpu, "CpuDeviceInterface was not 
registered!"); TORCH_CHECK( device_.type() == torch::kCPU, "Unsupported device: ", device_.str()); @@ -257,7 +234,8 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale( outputDims.height); if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) { - createSwsContext(swsFrameContext, avFrame->colorspace); + swsContext_ = createSwsContext( + swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_); prevSwsFrameContext_ = swsFrameContext; } @@ -276,50 +254,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale( return resultHeight; } -void CpuDeviceInterface::createSwsContext( - const SwsFrameContext& swsFrameContext, - const enum AVColorSpace colorspace) { - SwsContext* swsContext = sws_getContext( - swsFrameContext.inputWidth, - swsFrameContext.inputHeight, - swsFrameContext.inputFormat, - swsFrameContext.outputWidth, - swsFrameContext.outputHeight, - AV_PIX_FMT_RGB24, - swsFlags_, - nullptr, - nullptr, - nullptr); - TORCH_CHECK(swsContext, "sws_getContext() returned nullptr"); - - int* invTable = nullptr; - int* table = nullptr; - int srcRange, dstRange, brightness, contrast, saturation; - int ret = sws_getColorspaceDetails( - swsContext, - &invTable, - &srcRange, - &table, - &dstRange, - &brightness, - &contrast, - &saturation); - TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1"); - - const int* colorspaceTable = sws_getCoefficients(colorspace); - ret = sws_setColorspaceDetails( - swsContext, - colorspaceTable, - srcRange, - colorspaceTable, - dstRange, - brightness, - contrast, - saturation); - TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1"); - - swsContext_.reset(swsContext); -} torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph( const UniqueAVFrame& avFrame, diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 3f6f7c962..4c6e38698 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ 
b/src/torchcodec/_core/CpuDeviceInterface.h @@ -54,27 +54,6 @@ class CpuDeviceInterface : public DeviceInterface { ColorConversionLibrary getColorConversionLibrary( const FrameDims& inputFrameDims) const; - struct SwsFrameContext { - int inputWidth = 0; - int inputHeight = 0; - AVPixelFormat inputFormat = AV_PIX_FMT_NONE; - int outputWidth = 0; - int outputHeight = 0; - - SwsFrameContext() = default; - SwsFrameContext( - int inputWidth, - int inputHeight, - AVPixelFormat inputFormat, - int outputWidth, - int outputHeight); - bool operator==(const SwsFrameContext&) const; - bool operator!=(const SwsFrameContext&) const; - }; - - void createSwsContext( - const SwsFrameContext& swsFrameContext, - const enum AVColorSpace colorspace); VideoStreamOptions videoStreamOptions_; AVRational timeBase_; diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 97ff082e1..b9663d8d2 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -605,4 +605,73 @@ int64_t computeSafeDuration( } } +SwsFrameContext::SwsFrameContext( + int inputWidth, + int inputHeight, + AVPixelFormat inputFormat, + int outputWidth, + int outputHeight) + : inputWidth(inputWidth), + inputHeight(inputHeight), + inputFormat(inputFormat), + outputWidth(outputWidth), + outputHeight(outputHeight) {} + +bool SwsFrameContext::operator==(const SwsFrameContext& other) const { + return inputWidth == other.inputWidth && inputHeight == other.inputHeight && + inputFormat == other.inputFormat && outputWidth == other.outputWidth && + outputHeight == other.outputHeight; +} + +bool SwsFrameContext::operator!=(const SwsFrameContext& other) const { + return !(*this == other); +} + +UniqueSwsContext createSwsContext( + const SwsFrameContext& swsFrameContext, + AVColorSpace colorspace, + AVPixelFormat outputFormat, + int swsFlags) { + SwsContext* swsContext = sws_getContext( + swsFrameContext.inputWidth, + swsFrameContext.inputHeight, + 
swsFrameContext.inputFormat, + swsFrameContext.outputWidth, + swsFrameContext.outputHeight, + outputFormat, + swsFlags, + nullptr, + nullptr, + nullptr); + TORCH_CHECK(swsContext, "sws_getContext() returned nullptr"); + + int* invTable = nullptr; + int* table = nullptr; + int srcRange, dstRange, brightness, contrast, saturation; + int ret = sws_getColorspaceDetails( + swsContext, + &invTable, + &srcRange, + &table, + &dstRange, + &brightness, + &contrast, + &saturation); + TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1"); + + const int* colorspaceTable = sws_getCoefficients(colorspace); + ret = sws_setColorspaceDetails( + swsContext, + colorspaceTable, + srcRange, + colorspaceTable, + dstRange, + brightness, + contrast, + saturation); + TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1"); + + return UniqueSwsContext(swsContext); +} + } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 337616ddc..a5c0ff980 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -9,6 +9,7 @@ #include #include #include +#include extern "C" { #include @@ -103,6 +104,32 @@ using UniqueAVBufferSrcParameters = std::unique_ptr< AVBufferSrcParameters, Deleterv>; +// Common swscale context management for efficient reuse across device interfaces +struct SwsFrameContext { + int inputWidth; + int inputHeight; + AVPixelFormat inputFormat; + int outputWidth; + int outputHeight; + + SwsFrameContext( + int inputWidth, + int inputHeight, + AVPixelFormat inputFormat, + int outputWidth, + int outputHeight); + + bool operator==(const SwsFrameContext& other) const; + bool operator!=(const SwsFrameContext& other) const; +}; + +// Utility functions for swscale context management +UniqueSwsContext createSwsContext( + const SwsFrameContext& swsFrameContext, + AVColorSpace colorspace, + AVPixelFormat outputFormat = AV_PIX_FMT_RGB24, + int swsFlags = SWS_BILINEAR); + // 
These 2 classes share the same underlying AVPacket object. They are meant to // be used in tandem, like so: // diff --git a/test/test_decoders.py b/test/test_decoders.py index 6e08e05a4..75cd225a2 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1712,8 +1712,13 @@ def test_beta_cuda_interface_cpu_fallback(self): ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) with set_cuda_backend("beta"): beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + + from torchvision.io import write_png + from torchvision.utils import make_grid + write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png") - torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) + assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25 + # torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) @needs_cuda def test_beta_cuda_interface_error(self): From c6bda331fe9827c9730344f7c3079c6b322c15fe Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 22:29:55 +0100 Subject: [PATCH 05/10] WIP --- .../_core/BetaCudaDeviceInterface.cpp | 95 ++++++++++++------- src/torchcodec/_core/CpuDeviceInterface.cpp | 5 +- src/torchcodec/_core/CpuDeviceInterface.h | 1 - src/torchcodec/_core/FFMPEGCommon.h | 5 +- test/test_decoders.py | 3 +- 5 files changed, 67 insertions(+), 42 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a08309752..2803c5539 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -215,7 +215,8 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { } // namespace BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) - : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { + : DeviceInterface(device), + prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { TORCH_CHECK(g_cuda_beta, 
"BetaCudaDeviceInterface was not registered!"); TORCH_CHECK( device_.type() == torch::kCUDA, "Unsupported device: ", device_.str()); @@ -679,7 +680,6 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( int width = cpuFrame->width; int height = cpuFrame->height; - // Step 1: Convert to NV12 on CPU using cached swscale context UniqueAVFrame nv12CpuFrame(av_frame_alloc()); TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); @@ -688,13 +688,17 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( nv12CpuFrame->height = height; int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0); - TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ", - getFFMPEGErrorStringFromErrorCode(ret)); + TORCH_CHECK( + ret >= 0, + "Failed to allocate NV12 CPU frame buffer: ", + getFFMPEGErrorStringFromErrorCode(ret)); - // Create or reuse swscale context using caching logic SwsFrameContext swsFrameContext( - width, height, static_cast(cpuFrame->format), - width, height); + width, + height, + static_cast(cpuFrame->format), + width, + height); if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) { swsContext_ = createSwsContext( @@ -704,21 +708,27 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( int convertedHeight = sws_scale( swsContext_.get(), - const_cast(cpuFrame->data), cpuFrame->linesize, - 0, height, - nv12CpuFrame->data, nv12CpuFrame->linesize); - TORCH_CHECK(convertedHeight == height, "sws_scale failed for CPU->NV12 conversion"); + const_cast(cpuFrame->data), + cpuFrame->linesize, + 0, + height, + nv12CpuFrame->data, + nv12CpuFrame->linesize); + TORCH_CHECK( + convertedHeight == height, "sws_scale failed for CPU->NV12 conversion"); - // Step 2: Allocate CUDA memory int ySize = width * height; int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane - size_t totalSize = ySize + uvSize; + size_t totalSize = static_cast(ySize + uvSize); uint8_t* cudaBuffer = nullptr; - cudaError_t 
err = cudaMalloc(reinterpret_cast(&cudaBuffer), totalSize); - TORCH_CHECK(err == cudaSuccess, "Failed to allocate CUDA memory: ", cudaGetErrorString(err)); + cudaError_t err = + cudaMalloc(reinterpret_cast(&cudaBuffer), totalSize); + TORCH_CHECK( + err == cudaSuccess, + "Failed to allocate CUDA memory: ", + cudaGetErrorString(err)); - // Step 3: Create GPU frame UniqueAVFrame gpuFrame(av_frame_alloc()); TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame"); @@ -730,33 +740,47 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( gpuFrame->linesize[0] = width; gpuFrame->linesize[1] = width; - // Step 4: Copy data from CPU NV12 to GPU using cudaMemcpy2D for safety err = cudaMemcpy2D( - gpuFrame->data[0], gpuFrame->linesize[0], - nv12CpuFrame->data[0], nv12CpuFrame->linesize[0], - width, height, + gpuFrame->data[0], + gpuFrame->linesize[0], + nv12CpuFrame->data[0], + nv12CpuFrame->linesize[0], + width, + height, cudaMemcpyHostToDevice); - TORCH_CHECK(err == cudaSuccess, "Failed to copy Y plane to GPU: ", cudaGetErrorString(err)); + TORCH_CHECK( + err == cudaSuccess, + "Failed to copy Y plane to GPU: ", + cudaGetErrorString(err)); err = cudaMemcpy2D( - gpuFrame->data[1], gpuFrame->linesize[1], - nv12CpuFrame->data[1], nv12CpuFrame->linesize[1], - width, height / 2, + gpuFrame->data[1], + gpuFrame->linesize[1], + nv12CpuFrame->data[1], + nv12CpuFrame->linesize[1], + width, + height / 2, cudaMemcpyHostToDevice); - TORCH_CHECK(err == cudaSuccess, "Failed to copy UV plane to GPU: ", cudaGetErrorString(err)); + TORCH_CHECK( + err == cudaSuccess, + "Failed to copy UV plane to GPU: ", + cudaGetErrorString(err)); - // Step 5: Set up proper GPU memory cleanup using AVFrame's reference counting ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get()); - TORCH_CHECK(ret >= 0, "Failed to copy frame properties: ", - getFFMPEGErrorStringFromErrorCode(ret)); + TORCH_CHECK( + ret >= 0, + "Failed to copy frame properties: ", + 
getFFMPEGErrorStringFromErrorCode(ret)); - // Create a buffer reference that will automatically free CUDA memory when frame is destroyed gpuFrame->opaque_ref = av_buffer_create( - reinterpret_cast(cudaBuffer), 0, // size=0 since we're not using the data pointer - cudaBufferFreeCallback, - cudaBuffer, // pass the actual CUDA buffer as opaque data - 0); - TORCH_CHECK(gpuFrame->opaque_ref != nullptr, "Failed to create GPU memory cleanup reference"); + nullptr, // data + 0, // data size + cudaBufferFreeCallback, // callback triggered by av_frame_free() + cudaBuffer, // parameter to callback + 0); // flags + TORCH_CHECK( + gpuFrame->opaque_ref != nullptr, + "Failed to create GPU memory cleanup reference"); return gpuFrame; } @@ -765,7 +789,8 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { - UniqueAVFrame gpuFrame = cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); + UniqueAVFrame gpuFrame = + cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame); // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24(). 
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index 329f5af93..43db4bf0d 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -15,9 +15,9 @@ static bool g_cpu = registerDeviceInterface( } // namespace - CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device) - : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { + : DeviceInterface(device), + prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!"); TORCH_CHECK( device_.type() == torch::kCPU, "Unsupported device: ", device_.str()); @@ -254,7 +254,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale( return resultHeight; } - torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph( const UniqueAVFrame& avFrame, const FrameDims& outputDims) { diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 4c6e38698..f7c57045a 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ b/src/torchcodec/_core/CpuDeviceInterface.h @@ -54,7 +54,6 @@ class CpuDeviceInterface : public DeviceInterface { ColorConversionLibrary getColorConversionLibrary( const FrameDims& inputFrameDims) const; - VideoStreamOptions videoStreamOptions_; AVRational timeBase_; diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index a5c0ff980..683e2025e 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -6,10 +6,10 @@ #pragma once +#include #include #include #include -#include extern "C" { #include @@ -104,7 +104,8 @@ using UniqueAVBufferSrcParameters = std::unique_ptr< AVBufferSrcParameters, Deleterv>; -// Common swscale context management for efficient reuse across device interfaces +// Common swscale context management for efficient reuse across device +// interfaces struct SwsFrameContext { int 
inputWidth; int inputHeight; diff --git a/test/test_decoders.py b/test/test_decoders.py index 75cd225a2..ddd2e9189 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1712,9 +1712,10 @@ def test_beta_cuda_interface_cpu_fallback(self): ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) with set_cuda_backend("beta"): beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - + from torchvision.io import write_png from torchvision.utils import make_grid + write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png") assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25 From f4c8f4ee3409103d1ec860c42f451d6a1615a70f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 22:46:14 +0100 Subject: [PATCH 06/10] WIP --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 3 +-- src/torchcodec/_core/CpuDeviceInterface.cpp | 3 +-- src/torchcodec/_core/FFMPEGCommon.h | 11 ++++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 2803c5539..979a8af9d 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -215,8 +215,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { } // namespace BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) - : DeviceInterface(device), - prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { + : DeviceInterface(device) { TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!"); TORCH_CHECK( device_.type() == torch::kCUDA, "Unsupported device: ", device_.str()); diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index 43db4bf0d..bb0988a13 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -16,8 +16,7 @@ static bool g_cpu = 
registerDeviceInterface( } // namespace CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device) - : DeviceInterface(device), - prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) { + : DeviceInterface(device) { TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!"); TORCH_CHECK( device_.type() == torch::kCPU, "Unsupported device: ", device_.str()); diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index 683e2025e..68bd7fd02 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -107,12 +107,13 @@ using UniqueAVBufferSrcParameters = std::unique_ptr< // Common swscale context management for efficient reuse across device // interfaces struct SwsFrameContext { - int inputWidth; - int inputHeight; - AVPixelFormat inputFormat; - int outputWidth; - int outputHeight; + int inputWidth = 0; + int inputHeight = 0; + AVPixelFormat inputFormat = AV_PIX_FMT_NONE; + int outputWidth = 0; + int outputHeight = 0; + SwsFrameContext() = default; SwsFrameContext( int inputWidth, int inputHeight, From 340974ab6a0a5f57d7414b58be7392e459d3e235 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Oct 2025 23:36:35 +0100 Subject: [PATCH 07/10] Docs --- .../_core/BetaCudaDeviceInterface.cpp | 38 +++++++++---- .../_core/BetaCudaDeviceInterface.h | 5 +- src/torchcodec/_core/FFMPEGCommon.h | 54 +++++++++---------- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 979a8af9d..73f760287 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -212,6 +212,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) { return true; } +// Callback for freeing CUDA memory associated with AVFrame see where it's used +// for more details. 
+void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) { + cudaFree(opaque); +} + } // namespace BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) @@ -665,20 +671,23 @@ void BetaCudaDeviceInterface::flush() { std::swap(readyFrames_, emptyQueue); } -namespace { -// Cleanup callback for CUDA memory allocated for GPU frames -void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) { - cudaFree(opaque); -} -} // namespace - UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( UniqueAVFrame& cpuFrame) { + // This is called in the context of the CPU fallback: the frame was decoded on + // the CPU, and in this function we convert that frame into NV12 format and + // send it to the GPU. + // We do that in 2 steps: + // - First we convert the input CPU frame into an intermediate NV12 CPU frame + // using sws_scale. + // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This + // is what we return + TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null"); int width = cpuFrame->width; int height = cpuFrame->height; + // intermediate NV12 CPU frame. It's not on the GPU yet. UniqueAVFrame nv12CpuFrame(av_frame_alloc()); TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame"); @@ -707,7 +716,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( int convertedHeight = sws_scale( swsContext_.get(), - const_cast(cpuFrame->data), + cpuFrame->data, cpuFrame->linesize, 0, height, @@ -739,6 +748,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( gpuFrame->linesize[0] = width; gpuFrame->linesize[1] = width; + // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the + // linesizes (strides) may be different than the widths for the input CPU + // frame. That's precisely what cudaMemcpy2D is for. 
err = cudaMemcpy2D( gpuFrame->data[0], gpuFrame->linesize[0], @@ -771,10 +783,16 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( "Failed to copy frame properties: ", getFFMPEGErrorStringFromErrorCode(ret)); + // We're almost done, but we need to make sure the CUDA memory is freed + // properly. Usually, AVFrame data is freed when av_frame_free() is called + // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory + // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal + // with this is to associate the opaque_ref field of the AVFrame with a `free` + // callback that will then be called by av_frame_free(). gpuFrame->opaque_ref = av_buffer_create( - nullptr, // data + nullptr, // data - we don't need any 0, // data size - cudaBufferFreeCallback, // callback triggered by av_frame_free() + cudaBufferFreeCallback, // callback triggered by av_frame_free() cudaBuffer, // parameter to callback 0); // flags TORCH_CHECK( diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index 1fc97453c..ac94e55fd 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -81,7 +81,6 @@ class BetaCudaDeviceInterface : public DeviceInterface { unsigned int pitch, const CUVIDPARSERDISPINFO& dispInfo); - // Convert CPU frame to GPU NV12 frame for GPU color conversion UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame); CUvideoparser videoParser_ = nullptr; @@ -100,11 +99,9 @@ class BetaCudaDeviceInterface : public DeviceInterface { // NPP context for color conversion UniqueNppContext nppCtx_; - // Swscale context caching for CPU->GPU NV12 conversion + std::unique_ptr cpuFallback_; UniqueSwsContext swsContext_; SwsFrameContext prevSwsFrameContext_; - - std::unique_ptr cpuFallback_; }; } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h 
index 68bd7fd02..2d58abfb2 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -104,34 +104,6 @@ using UniqueAVBufferSrcParameters = std::unique_ptr< AVBufferSrcParameters, Deleterv>; -// Common swscale context management for efficient reuse across device -// interfaces -struct SwsFrameContext { - int inputWidth = 0; - int inputHeight = 0; - AVPixelFormat inputFormat = AV_PIX_FMT_NONE; - int outputWidth = 0; - int outputHeight = 0; - - SwsFrameContext() = default; - SwsFrameContext( - int inputWidth, - int inputHeight, - AVPixelFormat inputFormat, - int outputWidth, - int outputHeight); - - bool operator==(const SwsFrameContext& other) const; - bool operator!=(const SwsFrameContext& other) const; -}; - -// Utility functions for swscale context management -UniqueSwsContext createSwsContext( - const SwsFrameContext& swsFrameContext, - AVColorSpace colorspace, - AVPixelFormat outputFormat = AV_PIX_FMT_RGB24, - int swsFlags = SWS_BILINEAR); - // These 2 classes share the same underlying AVPacket object. 
They are meant to // be used in tandem, like so: // @@ -279,4 +251,30 @@ AVFilterContext* createBuffersinkFilter( AVFilterGraph* filterGraph, enum AVPixelFormat outputFormat); +struct SwsFrameContext { + int inputWidth = 0; + int inputHeight = 0; + AVPixelFormat inputFormat = AV_PIX_FMT_NONE; + int outputWidth = 0; + int outputHeight = 0; + + SwsFrameContext() = default; + SwsFrameContext( + int inputWidth, + int inputHeight, + AVPixelFormat inputFormat, + int outputWidth, + int outputHeight); + + bool operator==(const SwsFrameContext& other) const; + bool operator!=(const SwsFrameContext& other) const; +}; + +// Utility functions for swscale context management +UniqueSwsContext createSwsContext( + const SwsFrameContext& swsFrameContext, + AVColorSpace colorspace, + AVPixelFormat outputFormat = AV_PIX_FMT_RGB24, + int swsFlags = SWS_BILINEAR); + } // namespace facebook::torchcodec From 3afc97f1d4b54c9ad52fde8bca3374299358de8b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Oct 2025 18:47:19 +0100 Subject: [PATCH 08/10] Add test --- test/test_decoders.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test/test_decoders.py b/test/test_decoders.py index ddd2e9189..5f0f05668 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1709,17 +1709,23 @@ def test_beta_cuda_interface_cpu_fallback(self): # fallbacks to the CPU path in such cases. We assert that we fall back # to the CPU path, too. - ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - with set_cuda_backend("beta"): - beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) + ref_dec = VideoDecoder(H265_VIDEO.path, device="cuda") + ref_frames = ref_dec.get_frame_at(0) + assert ( + _core._get_backend_details(ref_dec._decoder) + == "FFmpeg CUDA Device Interface. Using CPU fallback." 
+ ) - from torchvision.io import write_png - from torchvision.utils import make_grid + with set_cuda_backend("beta"): + beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda") - write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png") + assert ( + _core._get_backend_details(beta_dec._decoder) + == "Beta CUDA Device Interface. Using CPU fallback." + ) + beta_frame = beta_dec.get_frame_at(0) - assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25 - # torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0) + assert psnr(ref_frames.data.cpu(), beta_frame.data.cpu()) > 25 @needs_cuda def test_beta_cuda_interface_error(self): From 89f1547ab596bd3c4d57b284120253f4721d6c13 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 25 Oct 2025 01:36:37 +0100 Subject: [PATCH 09/10] remove .cpu() call --- test/test_decoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_decoders.py b/test/test_decoders.py index 5f0f05668..5e5028da6 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1725,7 +1725,7 @@ def test_beta_cuda_interface_cpu_fallback(self): ) beta_frame = beta_dec.get_frame_at(0) - assert psnr(ref_frames.data.cpu(), beta_frame.data.cpu()) > 25 + assert psnr(ref_frames.data, beta_frame.data) > 25 @needs_cuda def test_beta_cuda_interface_error(self): From 61d4c7fc1ea490bfa501b4c0beacc5436a81a20d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Oct 2025 17:34:04 +0000 Subject: [PATCH 10/10] Add evenness checks before dividing by 2 --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 8437f3d17..587456f34 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -729,6 +729,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( convertedHeight == height, 
"sws_scale failed for CPU->NV12 conversion"); int ySize = width * height; + TORCH_CHECK( + ySize % 2 == 0, + "Y plane size must be even. Please report on TorchCodec repo."); int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane size_t totalSize = static_cast<size_t>(ySize + uvSize); @@ -767,6 +770,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12( "Failed to copy Y plane to GPU: ", cudaGetErrorString(err)); + TORCH_CHECK( + height % 2 == 0, + "height must be even. Please report on TorchCodec repo."); err = cudaMemcpy2D( gpuFrame->data[1], gpuFrame->linesize[1],