From bf3e29ba6859eae5e563da3062257e15f8b9ed4c Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 17:43:59 +0100
Subject: [PATCH 01/10] WIP: BETA CUDA: upload CPU-fallback frames as NV12 and color-convert on GPU

---
 .../_core/BetaCudaDeviceInterface.cpp         | 112 ++++++++++++++++--
 .../_core/BetaCudaDeviceInterface.h           |   3 +
 2 files changed, 104 insertions(+), 11 deletions(-)
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 07ed92126..00f31816f 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -665,22 +665,112 @@ void BetaCudaDeviceInterface::flush() {
   std::swap(readyFrames_, emptyQueue);
 }
 
+namespace {
+// Cleanup callback for CUDA memory allocated for GPU frames
+void cudaBufferFreeCallback(void* opaque, uint8_t* data) {
+  cudaFree(opaque);
+}
+} // namespace
+
+UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
+    UniqueAVFrame& cpuFrame) {
+  TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
+
+  int width = cpuFrame->width;
+  int height = cpuFrame->height;
+
+  // Step 1: Convert to NV12 on CPU using swscale
+  UniqueAVFrame nv12CpuFrame(av_frame_alloc());
+  TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
+
+  nv12CpuFrame->format = AV_PIX_FMT_NV12;
+  nv12CpuFrame->width = width;
+  nv12CpuFrame->height = height;
+
+  int ret = av_frame_get_buffer(nv12CpuFrame.get(), 32);
+  TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ",
+              getFFMPEGErrorStringFromErrorCode(ret));
+
+  UniqueSwsContext swsCtx(sws_getContext(
+      width, height, static_cast<AVPixelFormat>(cpuFrame->format),
+      width, height, AV_PIX_FMT_NV12,
+      SWS_BILINEAR, nullptr, nullptr, nullptr));
+  TORCH_CHECK(swsCtx != nullptr, "Failed to create SwsContext for CPU->NV12 conversion");
+
+  int convertedHeight = sws_scale(
+      swsCtx.get(),
+      const_cast<const uint8_t* const*>(cpuFrame->data), cpuFrame->linesize,
+      0, height,
+      nv12CpuFrame->data, nv12CpuFrame->linesize);
+  TORCH_CHECK(convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
+
+  // Step 2: Allocate CUDA memory
+  int ySize = width * height;
+  int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
+  size_t totalSize = ySize + uvSize;
+
+  uint8_t* cudaBuffer = nullptr;
+  cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
+  TORCH_CHECK(err == cudaSuccess, "Failed to allocate CUDA memory: ", cudaGetErrorString(err));
+
+  // Step 3: Create GPU frame
+  UniqueAVFrame gpuFrame(av_frame_alloc());
+  TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
+
+  gpuFrame->format = AV_PIX_FMT_CUDA;
+  gpuFrame->width = width;
+  gpuFrame->height = height;
+  gpuFrame->data[0] = cudaBuffer;
+  gpuFrame->data[1] = cudaBuffer + ySize;
+  gpuFrame->linesize[0] = width;
+  gpuFrame->linesize[1] = width;
+
+  // Step 4: Copy data from CPU NV12 to GPU using cudaMemcpy2D for safety
+  err = cudaMemcpy2D(
+      gpuFrame->data[0], gpuFrame->linesize[0],
+      nv12CpuFrame->data[0], nv12CpuFrame->linesize[0],
+      width, height,
+      cudaMemcpyHostToDevice);
+  TORCH_CHECK(err == cudaSuccess, "Failed to copy Y plane to GPU: ", cudaGetErrorString(err));
+
+  err = cudaMemcpy2D(
+      gpuFrame->data[1], gpuFrame->linesize[1],
+      nv12CpuFrame->data[1], nv12CpuFrame->linesize[1],
+      width, height / 2,
+      cudaMemcpyHostToDevice);
+  TORCH_CHECK(err == cudaSuccess, "Failed to copy UV plane to GPU: ", cudaGetErrorString(err));
+
+  // Step 5: Set up proper GPU memory cleanup using AVFrame's reference counting
+  ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
+  TORCH_CHECK(ret >= 0, "Failed to copy frame properties: ",
+              getFFMPEGErrorStringFromErrorCode(ret));
+
+  // Create a buffer reference that will automatically free CUDA memory when frame is destroyed
+  gpuFrame->opaque_ref = av_buffer_create(
+      reinterpret_cast<uint8_t*>(cudaBuffer), 0, // size=0 since we're not using the data pointer
+      cudaBufferFreeCallback,
+      cudaBuffer, // pass the actual CUDA buffer as opaque data
+      0);
+  TORCH_CHECK(gpuFrame->opaque_ref != nullptr, "Failed to create GPU memory cleanup reference");
+
+  return gpuFrame;
+}
+
 void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   if (cpuFallback_) {
-    // CPU decoded frame - need to do CPU color conversion then transfer to GPU
-    FrameOutput cpuFrameOutput;
-    cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
-
-    // Transfer CPU frame to GPU
-    if (preAllocatedOutputTensor.has_value()) {
-      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
-      frameOutput.data = preAllocatedOutputTensor.value();
-    } else {
-      frameOutput.data = cpuFrameOutput.data.to(device_);
-    }
+    // CPU decoded frame - convert to GPU NV12 and do GPU color conversion
+    UniqueAVFrame gpuNV12Frame = transferCpuFrameToGpuNV12(avFrame);
+
+    validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuNV12Frame);
+
+    at::cuda::CUDAStream nvdecStream =
+        at::cuda::getCurrentCUDAStream(device_.index());
+
+    frameOutput.data = convertNV12FrameToRGB(
+        gpuNV12Frame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
     return;
   }
 
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index 3a9520867..a1f913e6c 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -81,6 +81,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       unsigned int pitch,
       const CUVIDPARSERDISPINFO& dispInfo);
 
+  // Convert CPU frame to GPU NV12 frame for GPU color conversion
+  UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
+
   CUvideoparser videoParser_ = nullptr;
   UniqueCUvideodecoder decoder_;
   CUVIDEOFORMAT videoFormat_ = {};

From d86a19ec9bac3338935a225c29db773de0aba28a Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 18:16:43 +0100
Subject: [PATCH 02/10] WIP: mark unused data parameter of cudaBufferFreeCallback [[maybe_unused]]

---
 src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 00f31816f..cd3f21136 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -667,7 +667,7 @@ void BetaCudaDeviceInterface::flush() {
 
 namespace {
 // Cleanup callback for CUDA memory allocated for GPU frames
-void cudaBufferFreeCallback(void* opaque, uint8_t* data) {
+void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
   cudaFree(opaque);
 }
 } // namespace

From 7f88e1bd7acfc50f48306b8ca8907917d6ec3cf6 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 19:08:47 +0100
Subject: [PATCH 03/10] WIP: share one NV12->RGB path for CPU-fallback and NVDEC frames

---
 .../_core/BetaCudaDeviceInterface.cpp         | 21 +++++--------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index cd3f21136..76bd9b112 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -760,33 +760,22 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  if (cpuFallback_) {
-    // CPU decoded frame - convert to GPU NV12 and do GPU color conversion
-    UniqueAVFrame gpuNV12Frame = transferCpuFrameToGpuNV12(avFrame);
-
-    validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuNV12Frame);
-
-    at::cuda::CUDAStream nvdecStream =
-        at::cuda::getCurrentCUDAStream(device_.index());
-
-    frameOutput.data = convertNV12FrameToRGB(
-        gpuNV12Frame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
-    return;
-  }
+  // Convert CPU frame to GPU NV12 if using CPU fallback, otherwise use existing GPU frame
+  UniqueAVFrame gpuFrame = cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
 
   // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
   // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
   TORCH_CHECK(
-      avFrame->format == AV_PIX_FMT_CUDA,
+      gpuFrame->format == AV_PIX_FMT_CUDA,
       "Expected CUDA format frame from BETA CUDA interface");
 
-  validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
+  validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
 
   at::cuda::CUDAStream nvdecStream =
       at::cuda::getCurrentCUDAStream(device_.index());
 
   frameOutput.data = convertNV12FrameToRGB(
-      avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
+      gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
 std::string BetaCudaDeviceInterface::getDetails() {

From 5c61a96c1c04e7dcc40f53da37548014da6ecdaf Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 19:58:23 +0100
Subject: [PATCH 04/10] WIP: move SwsFrameContext/createSwsContext to FFMPEGCommon; cache SwsContext in BETA CUDA

---
 .../_core/BetaCudaDeviceInterface.cpp         | 22 +++---
 .../_core/BetaCudaDeviceInterface.h           |  4 ++
 src/torchcodec/_core/CpuDeviceInterface.cpp   | 72 +------------------
 src/torchcodec/_core/CpuDeviceInterface.h     | 21 ------
 src/torchcodec/_core/FFMPEGCommon.cpp         | 69 ++++++++++++++++++
 src/torchcodec/_core/FFMPEGCommon.h           | 27 +++++++
 test/test_decoders.py                         |  7 +-
 7 files changed, 122 insertions(+), 100 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 76bd9b112..a08309752 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -215,7 +215,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device) {
+    : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
   TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
@@ -679,7 +679,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   int width = cpuFrame->width;
   int height = cpuFrame->height;
 
-  // Step 1: Convert to NV12 on CPU using swscale
+  // Step 1: Convert to NV12 on CPU using cached swscale context
   UniqueAVFrame nv12CpuFrame(av_frame_alloc());
   TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
 
@@ -687,18 +687,23 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   nv12CpuFrame->width = width;
   nv12CpuFrame->height = height;
 
-  int ret = av_frame_get_buffer(nv12CpuFrame.get(), 32);
+  int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
   TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ",
               getFFMPEGErrorStringFromErrorCode(ret));
 
-  UniqueSwsContext swsCtx(sws_getContext(
+  // Create or reuse swscale context using caching logic
+  SwsFrameContext swsFrameContext(
       width, height, static_cast<AVPixelFormat>(cpuFrame->format),
-      width, height, AV_PIX_FMT_NV12,
-      SWS_BILINEAR, nullptr, nullptr, nullptr));
-  TORCH_CHECK(swsCtx != nullptr, "Failed to create SwsContext for CPU->NV12 conversion");
+      width, height);
+
+  if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
+    swsContext_ = createSwsContext(
+        swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
+    prevSwsFrameContext_ = swsFrameContext;
+  }
 
   int convertedHeight = sws_scale(
-      swsCtx.get(),
+      swsContext_.get(),
       const_cast<const uint8_t* const*>(cpuFrame->data), cpuFrame->linesize,
       0, height,
       nv12CpuFrame->data, nv12CpuFrame->linesize);
@@ -760,7 +765,6 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  // Convert CPU frame to GPU NV12 if using CPU fallback, otherwise use existing GPU frame
   UniqueAVFrame gpuFrame = cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
 
   // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index a1f913e6c..1fc97453c 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -100,6 +100,10 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   // NPP context for color conversion
   UniqueNppContext nppCtx_;
 
+  // Swscale context caching for CPU->GPU NV12 conversion
+  UniqueSwsContext swsContext_;
+  SwsFrameContext prevSwsFrameContext_;
+
   std::unique_ptr<DeviceInterface> cpuFallback_;
 };
 
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 5aa20b09e..329f5af93 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -15,32 +15,9 @@ static bool g_cpu = registerDeviceInterface(
 
 } // namespace
 
-CpuDeviceInterface::SwsFrameContext::SwsFrameContext(
-    int inputWidth,
-    int inputHeight,
-    AVPixelFormat inputFormat,
-    int outputWidth,
-    int outputHeight)
-    : inputWidth(inputWidth),
-      inputHeight(inputHeight),
-      inputFormat(inputFormat),
-      outputWidth(outputWidth),
-      outputHeight(outputHeight) {}
-
-bool CpuDeviceInterface::SwsFrameContext::operator==(
-    const CpuDeviceInterface::SwsFrameContext& other) const {
-  return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
-      inputFormat == other.inputFormat && outputWidth == other.outputWidth &&
-      outputHeight == other.outputHeight;
-}
-
-bool CpuDeviceInterface::SwsFrameContext::operator!=(
-    const CpuDeviceInterface::SwsFrameContext& other) const {
-  return !(*this == other);
-}
 
 CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device) {
+    : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
   TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
@@ -257,7 +234,8 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
       outputDims.height);
 
   if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
-    createSwsContext(swsFrameContext, avFrame->colorspace);
+    swsContext_ = createSwsContext(
+        swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
     prevSwsFrameContext_ = swsFrameContext;
   }
 
@@ -276,50 +254,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
   return resultHeight;
 }
 
-void CpuDeviceInterface::createSwsContext(
-    const SwsFrameContext& swsFrameContext,
-    const enum AVColorSpace colorspace) {
-  SwsContext* swsContext = sws_getContext(
-      swsFrameContext.inputWidth,
-      swsFrameContext.inputHeight,
-      swsFrameContext.inputFormat,
-      swsFrameContext.outputWidth,
-      swsFrameContext.outputHeight,
-      AV_PIX_FMT_RGB24,
-      swsFlags_,
-      nullptr,
-      nullptr,
-      nullptr);
-  TORCH_CHECK(swsContext, "sws_getContext() returned nullptr");
-
-  int* invTable = nullptr;
-  int* table = nullptr;
-  int srcRange, dstRange, brightness, contrast, saturation;
-  int ret = sws_getColorspaceDetails(
-      swsContext,
-      &invTable,
-      &srcRange,
-      &table,
-      &dstRange,
-      &brightness,
-      &contrast,
-      &saturation);
-  TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1");
-
-  const int* colorspaceTable = sws_getCoefficients(colorspace);
-  ret = sws_setColorspaceDetails(
-      swsContext,
-      colorspaceTable,
-      srcRange,
-      colorspaceTable,
-      dstRange,
-      brightness,
-      contrast,
-      saturation);
-  TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1");
-
-  swsContext_.reset(swsContext);
-}
 
 torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
     const UniqueAVFrame& avFrame,
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
index 3f6f7c962..4c6e38698 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.h
+++ b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -54,27 +54,6 @@ class CpuDeviceInterface : public DeviceInterface {
   ColorConversionLibrary getColorConversionLibrary(
       const FrameDims& inputFrameDims) const;
 
-  struct SwsFrameContext {
-    int inputWidth = 0;
-    int inputHeight = 0;
-    AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
-    int outputWidth = 0;
-    int outputHeight = 0;
-
-    SwsFrameContext() = default;
-    SwsFrameContext(
-        int inputWidth,
-        int inputHeight,
-        AVPixelFormat inputFormat,
-        int outputWidth,
-        int outputHeight);
-    bool operator==(const SwsFrameContext&) const;
-    bool operator!=(const SwsFrameContext&) const;
-  };
-
-  void createSwsContext(
-      const SwsFrameContext& swsFrameContext,
-      const enum AVColorSpace colorspace);
 
   VideoStreamOptions videoStreamOptions_;
   AVRational timeBase_;
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
index 97ff082e1..b9663d8d2 100644
--- a/src/torchcodec/_core/FFMPEGCommon.cpp
+++ b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -605,4 +605,73 @@ int64_t computeSafeDuration(
   }
 }
 
+SwsFrameContext::SwsFrameContext(
+    int inputWidth,
+    int inputHeight,
+    AVPixelFormat inputFormat,
+    int outputWidth,
+    int outputHeight)
+    : inputWidth(inputWidth),
+      inputHeight(inputHeight),
+      inputFormat(inputFormat),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight) {}
+
+bool SwsFrameContext::operator==(const SwsFrameContext& other) const {
+  return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
+      inputFormat == other.inputFormat && outputWidth == other.outputWidth &&
+      outputHeight == other.outputHeight;
+}
+
+bool SwsFrameContext::operator!=(const SwsFrameContext& other) const {
+  return !(*this == other);
+}
+
+// Returns a new SwsContext for swsFrameContext using colorspace's coefficients.
+UniqueSwsContext createSwsContext(
+    const SwsFrameContext& swsFrameContext,
+    AVColorSpace colorspace,
+    AVPixelFormat outputFormat,
+    int swsFlags) {
+  // Own the context immediately so a failing TORCH_CHECK below cannot leak it.
+  UniqueSwsContext swsContext(sws_getContext(
+      swsFrameContext.inputWidth,
+      swsFrameContext.inputHeight,
+      swsFrameContext.inputFormat,
+      swsFrameContext.outputWidth,
+      swsFrameContext.outputHeight,
+      outputFormat,
+      swsFlags,
+      nullptr,
+      nullptr,
+      nullptr));
+  TORCH_CHECK(swsContext != nullptr, "sws_getContext() returned nullptr");
+  // Keep the current ranges/adjustments; only swap in colorspace's tables.
+  int* invTable = nullptr;
+  int* table = nullptr;
+  int srcRange, dstRange, brightness, contrast, saturation;
+  int ret = sws_getColorspaceDetails(
+      swsContext.get(),
+      &invTable,
+      &srcRange,
+      &table,
+      &dstRange,
+      &brightness,
+      &contrast,
+      &saturation);
+  TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1");
+  const int* colorspaceTable = sws_getCoefficients(colorspace);
+  ret = sws_setColorspaceDetails(
+      swsContext.get(),
+      colorspaceTable,
+      srcRange,
+      colorspaceTable,
+      dstRange,
+      brightness,
+      contrast,
+      saturation);
+  TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1");
+  return swsContext;
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 337616ddc..a5c0ff980 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -9,6 +9,7 @@
 #include <memory>
 #include <stdexcept>
 #include <string>
+#include <torch/types.h>
 
 extern "C" {
 #include <libavcodec/avcodec.h>
@@ -103,6 +104,32 @@ using UniqueAVBufferSrcParameters = std::unique_ptr<
     AVBufferSrcParameters,
     Deleterv<AVBufferSrcParameters, void, av_freep>>;
 
+// Common swscale context management for efficient reuse across device interfaces
+struct SwsFrameContext {
+  int inputWidth;
+  int inputHeight;
+  AVPixelFormat inputFormat;
+  int outputWidth;
+  int outputHeight;
+
+  SwsFrameContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      int outputWidth,
+      int outputHeight);
+
+  bool operator==(const SwsFrameContext& other) const;
+  bool operator!=(const SwsFrameContext& other) const;
+};
+
+// Utility functions for swscale context management
+UniqueSwsContext createSwsContext(
+    const SwsFrameContext& swsFrameContext,
+    AVColorSpace colorspace,
+    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
+    int swsFlags = SWS_BILINEAR);
+
 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so:
 //
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 6e08e05a4..75cd225a2 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1712,8 +1712,13 @@ def test_beta_cuda_interface_cpu_fallback(self):
         ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
         with set_cuda_backend("beta"):
             beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+        
+        from torchvision.io import write_png
+        from torchvision.utils import make_grid
+        write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png")
 
-        torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0)
+        assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25
+        # torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0)
 
     @needs_cuda
     def test_beta_cuda_interface_error(self):

From c6bda331fe9827c9730344f7c3079c6b322c15fe Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 22:29:55 +0100
Subject: [PATCH 05/10] WIP: reformat; pass nullptr data to av_buffer_create

---
 .../_core/BetaCudaDeviceInterface.cpp         | 95 ++++++++++++-------
 src/torchcodec/_core/CpuDeviceInterface.cpp   |  5 +-
 src/torchcodec/_core/CpuDeviceInterface.h     |  1 -
 src/torchcodec/_core/FFMPEGCommon.h           |  5 +-
 test/test_decoders.py                         |  3 +-
 5 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index a08309752..2803c5539 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -215,7 +215,8 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
+    : DeviceInterface(device),
+      prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
   TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
@@ -679,7 +680,6 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   int width = cpuFrame->width;
   int height = cpuFrame->height;
 
-  // Step 1: Convert to NV12 on CPU using cached swscale context
   UniqueAVFrame nv12CpuFrame(av_frame_alloc());
   TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
 
@@ -688,13 +688,17 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   nv12CpuFrame->height = height;
 
   int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
-  TORCH_CHECK(ret >= 0, "Failed to allocate NV12 CPU frame buffer: ",
-              getFFMPEGErrorStringFromErrorCode(ret));
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to allocate NV12 CPU frame buffer: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
 
-  // Create or reuse swscale context using caching logic
   SwsFrameContext swsFrameContext(
-      width, height, static_cast<AVPixelFormat>(cpuFrame->format),
-      width, height);
+      width,
+      height,
+      static_cast<AVPixelFormat>(cpuFrame->format),
+      width,
+      height);
 
   if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
     swsContext_ = createSwsContext(
@@ -704,21 +708,27 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
 
   int convertedHeight = sws_scale(
       swsContext_.get(),
-      const_cast<const uint8_t* const*>(cpuFrame->data), cpuFrame->linesize,
-      0, height,
-      nv12CpuFrame->data, nv12CpuFrame->linesize);
-  TORCH_CHECK(convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
+      const_cast<const uint8_t* const*>(cpuFrame->data),
+      cpuFrame->linesize,
+      0,
+      height,
+      nv12CpuFrame->data,
+      nv12CpuFrame->linesize);
+  TORCH_CHECK(
+      convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
 
-  // Step 2: Allocate CUDA memory
   int ySize = width * height;
   int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
-  size_t totalSize = ySize + uvSize;
+  size_t totalSize = static_cast<size_t>(ySize + uvSize);
 
   uint8_t* cudaBuffer = nullptr;
-  cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
-  TORCH_CHECK(err == cudaSuccess, "Failed to allocate CUDA memory: ", cudaGetErrorString(err));
+  cudaError_t err =
+      cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "Failed to allocate CUDA memory: ",
+      cudaGetErrorString(err));
 
-  // Step 3: Create GPU frame
   UniqueAVFrame gpuFrame(av_frame_alloc());
   TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
 
@@ -730,33 +740,47 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   gpuFrame->linesize[0] = width;
   gpuFrame->linesize[1] = width;
 
-  // Step 4: Copy data from CPU NV12 to GPU using cudaMemcpy2D for safety
   err = cudaMemcpy2D(
-      gpuFrame->data[0], gpuFrame->linesize[0],
-      nv12CpuFrame->data[0], nv12CpuFrame->linesize[0],
-      width, height,
+      gpuFrame->data[0],
+      gpuFrame->linesize[0],
+      nv12CpuFrame->data[0],
+      nv12CpuFrame->linesize[0],
+      width,
+      height,
       cudaMemcpyHostToDevice);
-  TORCH_CHECK(err == cudaSuccess, "Failed to copy Y plane to GPU: ", cudaGetErrorString(err));
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "Failed to copy Y plane to GPU: ",
+      cudaGetErrorString(err));
 
   err = cudaMemcpy2D(
-      gpuFrame->data[1], gpuFrame->linesize[1],
-      nv12CpuFrame->data[1], nv12CpuFrame->linesize[1],
-      width, height / 2,
+      gpuFrame->data[1],
+      gpuFrame->linesize[1],
+      nv12CpuFrame->data[1],
+      nv12CpuFrame->linesize[1],
+      width,
+      height / 2,
       cudaMemcpyHostToDevice);
-  TORCH_CHECK(err == cudaSuccess, "Failed to copy UV plane to GPU: ", cudaGetErrorString(err));
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "Failed to copy UV plane to GPU: ",
+      cudaGetErrorString(err));
 
-  // Step 5: Set up proper GPU memory cleanup using AVFrame's reference counting
   ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
-  TORCH_CHECK(ret >= 0, "Failed to copy frame properties: ",
-              getFFMPEGErrorStringFromErrorCode(ret));
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to copy frame properties: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
 
-  // Create a buffer reference that will automatically free CUDA memory when frame is destroyed
   gpuFrame->opaque_ref = av_buffer_create(
-      reinterpret_cast<uint8_t*>(cudaBuffer), 0, // size=0 since we're not using the data pointer
-      cudaBufferFreeCallback,
-      cudaBuffer, // pass the actual CUDA buffer as opaque data
-      0);
-  TORCH_CHECK(gpuFrame->opaque_ref != nullptr, "Failed to create GPU memory cleanup reference");
+      nullptr, // data
+      0, // data size
+      cudaBufferFreeCallback,  // callback triggered by av_frame_free()
+      cudaBuffer, // parameter to callback
+      0); // flags
+  TORCH_CHECK(
+      gpuFrame->opaque_ref != nullptr,
+      "Failed to create GPU memory cleanup reference");
 
   return gpuFrame;
 }
@@ -765,7 +789,8 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  UniqueAVFrame gpuFrame = cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
+  UniqueAVFrame gpuFrame =
+      cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
 
   // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
   // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 329f5af93..43db4bf0d 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -15,9 +15,9 @@ static bool g_cpu = registerDeviceInterface(
 
 } // namespace
 
-
 CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device), prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
+    : DeviceInterface(device),
+      prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
   TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
@@ -254,7 +254,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
   return resultHeight;
 }
 
-
 torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
     const UniqueAVFrame& avFrame,
     const FrameDims& outputDims) {
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
index 4c6e38698..f7c57045a 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.h
+++ b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -54,7 +54,6 @@ class CpuDeviceInterface : public DeviceInterface {
   ColorConversionLibrary getColorConversionLibrary(
       const FrameDims& inputFrameDims) const;
 
-
   VideoStreamOptions videoStreamOptions_;
   AVRational timeBase_;
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index a5c0ff980..683e2025e 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -6,10 +6,10 @@
 
 #pragma once
 
+#include <torch/types.h>
 #include <memory>
 #include <stdexcept>
 #include <string>
-#include <torch/types.h>
 
 extern "C" {
 #include <libavcodec/avcodec.h>
@@ -104,7 +104,8 @@ using UniqueAVBufferSrcParameters = std::unique_ptr<
     AVBufferSrcParameters,
     Deleterv<AVBufferSrcParameters, void, av_freep>>;
 
-// Common swscale context management for efficient reuse across device interfaces
+// Common swscale context management for efficient reuse across device
+// interfaces
 struct SwsFrameContext {
   int inputWidth;
   int inputHeight;
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 75cd225a2..ddd2e9189 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1712,9 +1712,10 @@ def test_beta_cuda_interface_cpu_fallback(self):
         ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
         with set_cuda_backend("beta"):
             beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-        
+
         from torchvision.io import write_png
         from torchvision.utils import make_grid
+
         write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png")
 
         assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25

From f4c8f4ee3409103d1ec860c42f451d6a1615a70f Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 22:46:14 +0100
Subject: [PATCH 06/10] WIP

---
 src/torchcodec/_core/BetaCudaDeviceInterface.cpp |  3 +--
 src/torchcodec/_core/CpuDeviceInterface.cpp      |  3 +--
 src/torchcodec/_core/FFMPEGCommon.h              | 11 ++++++-----
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 2803c5539..979a8af9d 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -215,8 +215,7 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device),
-      prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
+    : DeviceInterface(device) {
   TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 43db4bf0d..bb0988a13 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -16,8 +16,7 @@ static bool g_cpu = registerDeviceInterface(
 } // namespace
 
 CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device),
-      prevSwsFrameContext_(0, 0, AV_PIX_FMT_NONE, 0, 0) {
+    : DeviceInterface(device) {
   TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 683e2025e..68bd7fd02 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -107,12 +107,13 @@ using UniqueAVBufferSrcParameters = std::unique_ptr<
 // Common swscale context management for efficient reuse across device
 // interfaces
 struct SwsFrameContext {
-  int inputWidth;
-  int inputHeight;
-  AVPixelFormat inputFormat;
-  int outputWidth;
-  int outputHeight;
+  int inputWidth = 0;
+  int inputHeight = 0;
+  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
+  int outputWidth = 0;
+  int outputHeight = 0;
 
+  SwsFrameContext() = default;
   SwsFrameContext(
       int inputWidth,
       int inputHeight,

From 340974ab6a0a5f57d7414b58be7392e459d3e235 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Tue, 21 Oct 2025 23:36:35 +0100
Subject: [PATCH 07/10] Docs

---
 .../_core/BetaCudaDeviceInterface.cpp         | 38 +++++++++----
 .../_core/BetaCudaDeviceInterface.h           |  5 +-
 src/torchcodec/_core/FFMPEGCommon.h           | 54 +++++++++----------
 3 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 979a8af9d..73f760287 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -212,6 +212,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
   return true;
 }
 
+// Callback for freeing CUDA memory associated with an AVFrame. See its use in
+// transferCpuFrameToGpuNV12() for more details.
+void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
+  cudaFree(opaque);
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -665,20 +671,23 @@ void BetaCudaDeviceInterface::flush() {
   std::swap(readyFrames_, emptyQueue);
 }
 
-namespace {
-// Cleanup callback for CUDA memory allocated for GPU frames
-void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
-  cudaFree(opaque);
-}
-} // namespace
-
 UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
     UniqueAVFrame& cpuFrame) {
+  // This is called in the context of the CPU fallback: the frame was decoded on
+  // the CPU, and in this function we convert that frame into NV12 format and
+  // send it to the GPU.
+  // We do that in 2 steps:
+  // - First we convert the input CPU frame into an intermediate NV12 CPU frame
+  //   using sws_scale.
+  // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
+  //   is what we return.
+
   TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
 
   int width = cpuFrame->width;
   int height = cpuFrame->height;
 
+  // Intermediate NV12 CPU frame. It's not on the GPU yet.
   UniqueAVFrame nv12CpuFrame(av_frame_alloc());
   TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
 
@@ -707,7 +716,7 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
 
   int convertedHeight = sws_scale(
       swsContext_.get(),
-      const_cast<const uint8_t* const*>(cpuFrame->data),
+      cpuFrame->data,
       cpuFrame->linesize,
       0,
       height,
@@ -739,6 +748,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
   gpuFrame->linesize[0] = width;
   gpuFrame->linesize[1] = width;
 
+  // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
+  // linesizes (strides) may be different than the widths for the input CPU
+  // frame. That's precisely what cudaMemcpy2D is for.
   err = cudaMemcpy2D(
       gpuFrame->data[0],
       gpuFrame->linesize[0],
@@ -771,10 +783,16 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
       "Failed to copy frame properties: ",
       getFFMPEGErrorStringFromErrorCode(ret));
 
+  // We're almost done, but we need to make sure the CUDA memory is freed
+  // properly. Usually, AVFrame data is freed when av_frame_free() is called
+  // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
+  // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
+  // with this is to associate the opaque_ref field of the AVFrame with a `free`
+  // callback that will then be called by av_frame_free().
   gpuFrame->opaque_ref = av_buffer_create(
-      nullptr, // data
+      nullptr, // data - we don't need any
       0, // data size
-      cudaBufferFreeCallback,  // callback triggered by av_frame_free()
+      cudaBufferFreeCallback, // callback triggered by av_frame_free()
       cudaBuffer, // parameter to callback
       0); // flags
   TORCH_CHECK(
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index 1fc97453c..ac94e55fd 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -81,7 +81,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       unsigned int pitch,
       const CUVIDPARSERDISPINFO& dispInfo);
 
-  // Convert CPU frame to GPU NV12 frame for GPU color conversion
   UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
 
   CUvideoparser videoParser_ = nullptr;
@@ -100,11 +99,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   // NPP context for color conversion
   UniqueNppContext nppCtx_;
 
-  // Swscale context caching for CPU->GPU NV12 conversion
+  std::unique_ptr<DeviceInterface> cpuFallback_;
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;
-
-  std::unique_ptr<DeviceInterface> cpuFallback_;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 68bd7fd02..2d58abfb2 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -104,34 +104,6 @@ using UniqueAVBufferSrcParameters = std::unique_ptr<
     AVBufferSrcParameters,
     Deleterv<AVBufferSrcParameters, void, av_freep>>;
 
-// Common swscale context management for efficient reuse across device
-// interfaces
-struct SwsFrameContext {
-  int inputWidth = 0;
-  int inputHeight = 0;
-  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
-  int outputWidth = 0;
-  int outputHeight = 0;
-
-  SwsFrameContext() = default;
-  SwsFrameContext(
-      int inputWidth,
-      int inputHeight,
-      AVPixelFormat inputFormat,
-      int outputWidth,
-      int outputHeight);
-
-  bool operator==(const SwsFrameContext& other) const;
-  bool operator!=(const SwsFrameContext& other) const;
-};
-
-// Utility functions for swscale context management
-UniqueSwsContext createSwsContext(
-    const SwsFrameContext& swsFrameContext,
-    AVColorSpace colorspace,
-    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
-    int swsFlags = SWS_BILINEAR);
-
 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so:
 //
@@ -279,4 +251,30 @@ AVFilterContext* createBuffersinkFilter(
     AVFilterGraph* filterGraph,
     enum AVPixelFormat outputFormat);
 
+struct SwsFrameContext {
+  int inputWidth = 0;
+  int inputHeight = 0;
+  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
+  int outputWidth = 0;
+  int outputHeight = 0;
+
+  SwsFrameContext() = default;
+  SwsFrameContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      int outputWidth,
+      int outputHeight);
+
+  bool operator==(const SwsFrameContext& other) const;
+  bool operator!=(const SwsFrameContext& other) const;
+};
+
+// Utility functions for swscale context management
+UniqueSwsContext createSwsContext(
+    const SwsFrameContext& swsFrameContext,
+    AVColorSpace colorspace,
+    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
+    int swsFlags = SWS_BILINEAR);
+
 } // namespace facebook::torchcodec

From 3afc97f1d4b54c9ad52fde8bca3374299358de8b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Fri, 24 Oct 2025 18:47:19 +0100
Subject: [PATCH 08/10] Add test

---
 test/test_decoders.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/test/test_decoders.py b/test/test_decoders.py
index ddd2e9189..5f0f05668 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1709,17 +1709,23 @@ def test_beta_cuda_interface_cpu_fallback(self):
         # fallbacks to the CPU path in such cases. We assert that we fall back
         # to the CPU path, too.
 
-        ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-        with set_cuda_backend("beta"):
-            beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+        ref_dec = VideoDecoder(H265_VIDEO.path, device="cuda")
+        ref_frames = ref_dec.get_frame_at(0)
+        assert (
+            _core._get_backend_details(ref_dec._decoder)
+            == "FFmpeg CUDA Device Interface. Using CPU fallback."
+        )
 
-        from torchvision.io import write_png
-        from torchvision.utils import make_grid
+        with set_cuda_backend("beta"):
+            beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda")
 
-        write_png(make_grid([ffmpeg.data, beta.data], nrow=2).cpu(), "out.png")
+        assert (
+            _core._get_backend_details(beta_dec._decoder)
+            == "Beta CUDA Device Interface. Using CPU fallback."
+        )
+        beta_frame = beta_dec.get_frame_at(0)
 
-        assert psnr(ffmpeg.data.cpu(), beta.data.cpu()) > 25
-        # torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0)
+        assert psnr(ref_frames.data.cpu(), beta_frame.data.cpu()) > 25
 
     @needs_cuda
     def test_beta_cuda_interface_error(self):

From 89f1547ab596bd3c4d57b284120253f4721d6c13 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Sat, 25 Oct 2025 01:36:37 +0100
Subject: [PATCH 09/10] remove .cpu() call

---
 test/test_decoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_decoders.py b/test/test_decoders.py
index 5f0f05668..5e5028da6 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1725,7 +1725,7 @@ def test_beta_cuda_interface_cpu_fallback(self):
         )
         beta_frame = beta_dec.get_frame_at(0)
 
-        assert psnr(ref_frames.data.cpu(), beta_frame.data.cpu()) > 25
+        assert psnr(ref_frames.data, beta_frame.data) > 25
 
     @needs_cuda
     def test_beta_cuda_interface_error(self):

From 61d4c7fc1ea490bfa501b4c0beacc5436a81a20d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 29 Oct 2025 17:34:04 +0000
Subject: [PATCH 10/10] Add evenness checks before dividing by 2

---
 src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 8437f3d17..587456f34 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -729,6 +729,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
       convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
 
   int ySize = width * height;
+  TORCH_CHECK(
+      ySize % 2 == 0,
+      "Y plane size must be even. Please report on TorchCodec repo.");
   int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
   size_t totalSize = static_cast<size_t>(ySize + uvSize);
 
@@ -767,6 +770,9 @@ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
       "Failed to copy Y plane to GPU: ",
       cudaGetErrorString(err));
 
+  TORCH_CHECK(
+      height % 2 == 0,
+      "height must be even. Please report on TorchCodec repo.");
   err = cudaMemcpy2D(
       gpuFrame->data[1],
       gpuFrame->linesize[1],