From 33ae29577e194713b301993a9c910f421cc78245 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Sun, 5 Oct 2025 11:47:48 +0100
Subject: [PATCH 1/2] Clarify stuff

---
 .../_core/BetaCudaDeviceInterface.cpp | 20 ++++++++++++++++++++
 test/test_decoders.py                 | 20 ++++++++-----------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 078655462..eb1850dd9 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -108,10 +108,30 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       " vs supported:",
       caps.nMaxMBCount);
 
+  // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
+  // sure it's actually supported.
+  TORCH_CHECK(
+      caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12),
+      "NV12 output format is not supported for this configuration. ",
+      "Codec: ",
+      static_cast<int>(videoFormat->codec),
+      ", chroma format: ",
+      static_cast<int>(videoFormat->chroma_format),
+      ", bit depth: ",
+      videoFormat->bit_depth_luma_minus8 + 8);
+
   // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
   decoderParams.ChromaFormat = videoFormat->chroma_format;
+  // We explicitly request the NV12 format, which means 10-bit videos will be
+  // automatically converted to 8 bits by NVDEC itself. That is, the raw frames
+  // we get back from cuvidMapVideoFrame will already be in 8-bit format, so we
+  // won't need to do the conversion ourselves, which is a lot easier.
+  // In the default interface, we have to do the 10 -> 8-bit conversion
+  // ourselves later, in convertAVFrameToFrameOutput(), because FFmpeg
+  // explicitly requests 10- or 16-bit output formats for >8-bit videos:
+  // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
   decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
   decoderParams.ulCreationFlags = cudaVideoCreate_Default;
   decoderParams.CodecType = videoFormat->codec;
diff --git a/test/test_decoders.py b/test/test_decoders.py
index e5139d089..ca73b99b8 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1288,18 +1288,15 @@ def test_10bit_videos(self, device, asset):
         # This just validates that we can decode 10-bit videos.
         # TODO validate against the ref that the decoded frames are correct
 
-        if device == "cuda:0:beta":
-            # This fails on our BETA interface on asset 0 (only!) with:
+        if device == "cuda:0:beta" and asset is H264_10BITS:
+            # This fails on the BETA interface with:
             #
             # RuntimeError: Codec configuration not supported on this GPU.
             # Codec: 4, chroma format: 1, bit depth: 10
             #
-            # I don't remember but I suspect asset 0 is actually the one that
-            # fallsback to the CPU path on the default CUDA interface (that
-            # would make sense)
-            # We should investigate if and how we could make that fallback
-            # happen for the BETA interface.
-            pytest.skip("TODONVDEC P2 - investigate and unskip")
+            # It works on the default interface because FFmpeg falls back to
+            # the CPU, while the BETA interface doesn't.
+            pytest.skip("Asset not supported by NVDEC")
 
         decoder = VideoDecoder(asset.path, device=device)
         decoder.get_frame_at(10)
@@ -1674,12 +1671,11 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
     @needs_cuda
     def test_beta_cuda_interface_small_h265(self):
-        # TODONVDEC P2 investigate why/how the default interface can decode this
-        # video.
+        # Test to illustrate the current difference in behavior between the BETA
+        # and the default interface: this video isn't supported by NVDEC, but on
+        # the default interface FFmpeg falls back to the CPU, while we don't.
 
-        # This is fine on the default interface - why?
         VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-        # But it fails on the beta interface due to input validation checks, which we took from DALI!
        with pytest.raises(
             RuntimeError,
             match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144",
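
Note on PATCH 1/2: the new comment contrasts two decode paths. The BETA
interface always asks NVDEC for 8-bit NV12 surfaces, so NVDEC itself
downconverts 10-bit content; the default interface lets FFmpeg pick a
high-bit-depth surface and converts to 8 bits afterwards. Below is a
simplified, self-contained sketch of that choice; the SurfaceFormat enum and
the pickSurfaceFormat() helper are made up for illustration and are not the
real cuviddec.h or FFmpeg code.

#include <cstdio>

// Stand-in surface formats (hypothetical, not the cuviddec.h enum values).
enum SurfaceFormat { NV12, P016 };

// Simplified model of the two paths described in the patch comment: the BETA
// interface always requests NV12 so NVDEC downconverts 10-bit -> 8-bit itself;
// the default (FFmpeg) path requests a high-bit-depth surface for >8-bit
// videos and converts to 8 bits later in convertAVFrameToFrameOutput().
SurfaceFormat pickSurfaceFormat(int bitDepthLumaMinus8, bool betaInterface) {
  if (betaInterface) {
    return NV12;
  }
  return bitDepthLumaMinus8 > 0 ? P016 : NV12;
}

int main() {
  // A 10-bit video has bit_depth_luma_minus8 == 2.
  std::printf("10-bit, BETA:    %s\n",
              pickSurfaceFormat(2, true) == NV12 ? "NV12" : "P016");
  std::printf("10-bit, default: %s\n",
              pickSurfaceFormat(2, false) == NV12 ? "NV12" : "P016");
  return 0;
}
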
+ pytest.skip("Asset not supported by NVDEC") decoder = VideoDecoder(asset.path, device=device) decoder.get_frame_at(10) @@ -1674,12 +1671,11 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): @needs_cuda def test_beta_cuda_interface_small_h265(self): - # TODONVDEC P2 investigate why/how the default interface can decode this - # video. + # Test to illustrate current difference in behavior between the BETA and + # the default interface: this video isn't supported by NVDEC, but in the + # default interface, FFMPEG fallsback to the CPU while we don't. - # This is fine on the default interface - why? VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0) - # But it fails on the beta interface due to input validation checks, which we took from DALI! with pytest.raises( RuntimeError, match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144", From 001bbc8062d09f40e6c09362e39500bcaa7fdf86 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 5 Oct 2025 12:04:19 +0100 Subject: [PATCH 2/2] Clearer bitwise logic --- src/torchcodec/_core/BetaCudaDeviceInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index eb1850dd9..333b92bde 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -111,7 +111,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make // sure it's actually supported. TORCH_CHECK( - caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12), + (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1, "NV12 output format is not supported for this configuration. ", "Codec: ", static_cast(videoFormat->codec),