4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -42,6 +42,10 @@ git clone git@github.com:pytorch/torchcodec.git
# Or, using https instead of ssh: git clone https://github.com/pytorch/torchcodec.git
cd torchcodec

# Optional, but recommended: define a persistent build directory, which speeds up
# subsequent builds.
export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"

pip install -e ".[dev]" --no-build-isolation -vv
# Or, for cuda support: ENABLE_CUDA=1 pip install -e ".[dev]" --no-build-isolation -vv
```
5 changes: 5 additions & 0 deletions setup.py
@@ -126,12 +126,17 @@ def _build_all_extensions_with_cmake(self):
f"-DTORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR={torchcodec_disable_compile_warning_as_error}",
]

self.build_temp = os.getenv("TORCHCODEC_CMAKE_BUILD_DIR", self.build_temp)
print(f"Using {self.build_temp = }", flush=True)
Path(self.build_temp).mkdir(parents=True, exist_ok=True)

print("Calling cmake (configure)", flush=True)
subprocess.check_call(
["cmake", str(_ROOT_DIR)] + cmake_args, cwd=self.build_temp
)
print("Calling cmake --build", flush=True)
subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)
print("Calling cmake --install", flush=True)
subprocess.check_call(["cmake", "--install", "."], cwd=self.build_temp)

def copy_extensions_to_source(self):
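The setup.py override above follows a simple pattern: read an optional environment variable, fall back to setuptools' default `build_temp`, and make sure the directory exists. A minimal sketch of the same logic outside of setuptools (the `resolve_build_dir` helper is hypothetical, for illustration only):

```python
import os
from pathlib import Path

def resolve_build_dir(default: str) -> Path:
    # TORCHCODEC_CMAKE_BUILD_DIR, when set, pins the CMake build tree to a
    # stable location so later builds can reuse the CMake cache and object
    # files instead of starting over in a fresh temp directory.
    build_dir = Path(os.getenv("TORCHCODEC_CMAKE_BUILD_DIR", default))
    build_dir.mkdir(parents=True, exist_ok=True)
    return build_dir

# Falls back to "build-tmp" unless the env var is set.
print(resolve_build_dir("build-tmp"))
```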
179 changes: 148 additions & 31 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -196,54 +196,171 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) {
// We check that avFrame->format == AV_PIX_FMT_CUDA. This only ensures the
// AVFrame is on GPU memory. It can be on CPU memory if the video isn't
// supported by NVDEC for whatever reason: NVDEC falls back to CPU decoding in
// this case, and our check fails.
// TODO: we could send the frame back into the CPU path, and rely on
// swscale/filtergraph to run the color conversion to properly output the
// frame.
TORCH_CHECK(
avFrame->format == AV_PIX_FMT_CUDA,
"Expected format to be AV_PIX_FMT_CUDA, got " +
std::string(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)));
"Expected format to be AV_PIX_FMT_CUDA, got ",
(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)
? av_get_pix_fmt_name((AVPixelFormat)avFrame->format)
: "unknown"),
". When that happens, it is probably because the video is not supported by NVDEC. "
"Try using the CPU device instead. "
"If the video is 10bit, we are tracking 10bit support in "
"https://github.com/pytorch/torchcodec/issues/776");

// Above we checked that the AVFrame was on GPU, but that's not enough, we
// also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
// because this is what the NPP color conversion routines expect.
// TODO: we should investigate how we can perform color conversion for
// non-8-bit videos. This is supported on CPU.
TORCH_CHECK(
avFrame->hw_frames_ctx != nullptr,
"The AVFrame does not have a hw_frames_ctx. "
"That's unexpected, please report this to the TorchCodec repo.");

AVPixelFormat actualFormat =
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data)
->sw_format;
TORCH_CHECK(
actualFormat == AV_PIX_FMT_NV12 || actualFormat == AV_PIX_FMT_P010LE,
"The AVFrame is ",
(av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
: "unknown"),
", but we expected AV_PIX_FMT_NV12 or AV_PIX_FMT_P010LE. "
"Try using the CPU device instead.");

auto frameDims =
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
int height = frameDims.height;
int width = frameDims.width;
torch::Tensor& dst = frameOutput.data;
if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
auto shape = dst.sizes();
TORCH_CHECK(
(shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
(shape[2] == 3),
"Expected tensor of shape ",
height,
"x",
width,
"x3, got ",
shape);
torch::Tensor intermediateTensor;

if (actualFormat == AV_PIX_FMT_P010LE) {
// For 10-bit, we need a 16-bit intermediate tensor, then convert to 8-bit
intermediateTensor = torch::empty(
{height, width, 3},
torch::TensorOptions().dtype(torch::kUInt16).device(device_));

if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
}
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
// For 8-bit formats, use the output tensor directly
if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
auto shape = dst.sizes();
TORCH_CHECK(
(shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
(shape[2] == 3),
"Expected tensor of shape ",
height,
"x",
width,
"x3, got ",
shape);
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
}
}

// Use the user-requested GPU for running the NPP kernel.
c10::cuda::CUDAGuard deviceGuard(device_);

NppiSize oSizeROI = {width, height};
Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};

NppStatus status;
if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
status = nppiNV12ToRGB_709CSC_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
} else {
status = nppiNV12ToRGB_8u_P2C3R(

if (actualFormat == AV_PIX_FMT_NV12) {
// 8-bit NV12 format
Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};

if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
status = nppiNV12ToRGB_709CSC_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
} else {
status = nppiNV12ToRGB_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
}
} else if (actualFormat == AV_PIX_FMT_P010LE) {
// 10-bit semi-planar format (like NV12 but 16-bit)
// P010LE has Y plane + interleaved UV plane, 10-bit data in high bits
const Npp16u* input[2] = {
reinterpret_cast<const Npp16u*>(avFrame->data[0]), // Y plane (16-bit)
reinterpret_cast<const Npp16u*>(
avFrame->data[1]) // UV plane (16-bit interleaved)
};

// Choose color matrix based on colorspace
const Npp32f (*aTwist)[4];

// TODO: use even more accurate values from
// https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
// (those values need to be divided by 65536 to get the floats).
// BT.709 matrix (HDTV)
static const Npp32f bt709Matrix[3][4] = {
{1.0f, 0.0f, 1.402f, 0.0f},
{1.0f, -0.344136f, -0.714136f, -32768.0f},
{1.0f, 1.772f, 0.0f, -32768.0f}
};

// BT.601 matrix (SDTV)
static const Npp32f bt601Matrix[3][4] = {
{1.0f, 0.0f, 1.596f, 0.0f},
{1.0f, -0.392f, -0.813f, -32768.0f},
{1.0f, 2.017f, 0.0f, -32768.0f}
};

if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
aTwist = bt709Matrix;
} else {
// Default to BT.601 for other colorspaces (including
// AVCOL_SPC_BT470BG and AVCOL_SPC_SMPTE170M).
aTwist = bt601Matrix;
}

// Create NPP stream context
NppStreamContext nppStreamCtx;
nppStreamCtx.hStream = nppGetStream();

int rSrcStep[2] = {
avFrame->linesize[0], avFrame->linesize[1]}; // Y and UV strides

status = nppiNV12ToRGB_16u_ColorTwist32f_P2C3R_Ctx(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
rSrcStep,
reinterpret_cast<Npp16u*>(intermediateTensor.data_ptr()),
intermediateTensor.stride(0) * sizeof(uint16_t),
oSizeROI,
aTwist,
nppStreamCtx);

// Convert 16-bit to 8-bit: P010LE keeps each 10-bit sample in the high
// bits of a 16-bit word, so dividing by 256 (a right shift by 8) maps it
// to 8-bit.
if (status == NPP_SUCCESS) {
// Copy into dst so a caller-provided pre-allocated tensor is actually
// filled, rather than rebinding dst to a freshly allocated tensor.
dst.copy_(intermediateTensor.div(256).to(torch::kUInt8));
}
}
TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");

TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert frame.");

// Make the pytorch stream wait for the npp kernel to finish before using the
// output.
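To make the bit-depth arithmetic above concrete (the original comment conflated "divide by 4" with "divide by 256"): P010LE stores each 10-bit sample in the high bits of a little-endian 16-bit word, so dividing the 16-bit word by 256 is equivalent to dividing the underlying 10-bit sample by 4. A small torch sketch of that equivalence:

```python
import torch

# A 10-bit sample v is stored in P010LE as the 16-bit word (v << 6).
samples_10bit = torch.tensor([0, 512, 1023], dtype=torch.int32)
words_16bit = samples_10bit << 6

# Dividing the word by 256 (>> 8) equals dividing the sample by 4 (>> 2):
# both map the 10-bit range [0, 1023] onto the 8-bit range [0, 255].
assert torch.equal(words_16bit >> 8, samples_10bit >> 2)
print(words_16bit >> 8)  # tensor([  0, 128, 255])
```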
Binary file added test/resources/h264_10bits.mp4
Binary file added test/resources/h265_10bits.mp4
35 changes: 35 additions & 0 deletions test/test_decoders.py
@@ -26,12 +26,15 @@
AV1_VIDEO,
cpu_and_cuda,
get_ffmpeg_major_version,
H264_10BITS,
H265_10BITS,
H265_VIDEO,
in_fbcode,
NASA_AUDIO,
NASA_AUDIO_MP3,
NASA_AUDIO_MP3_44100,
NASA_VIDEO,
needs_cuda,
SINE_MONO_S16,
SINE_MONO_S32,
SINE_MONO_S32_44100,
@@ -1138,6 +1141,38 @@ def test_pts_to_dts_fallback(self, seek_mode):
with pytest.raises(AssertionError, match="not equal"):
torch.testing.assert_close(decoder[0], decoder[10])

@needs_cuda
# TODO: also parametrize over H264_10BITS once it decodes on CUDA.
@pytest.mark.parametrize("asset", (H265_10BITS,))
def test_10bit_videos_cuda(self, asset):
# Assert that we raise a proper error on different kinds of 10-bit videos.

# TODO: we should investigate how to support 10-bit videos on GPU.
# See https://github.com/pytorch/torchcodec/issues/776

from torchvision.io import write_png

decoder = VideoDecoder(asset.path, device="cuda")
gpu_frame = decoder.get_frame_at(0)
write_png(gpu_frame.data.cpu(), "gpu.png")

decoder = VideoDecoder(asset.path, device="cpu")
cpu_frame = decoder.get_frame_at(0)
write_png(cpu_frame.data, "cpu.png")

if asset is H265_10BITS:
match = "The AVFrame is p010le, but we expected AV_PIX_FMT_NV12."
else:
match = "Expected format to be AV_PIX_FMT_CUDA, got yuv420p10le."

@pytest.mark.parametrize("asset", (H264_10BITS, H265_10BITS))
def test_10bit_videos_cpu(self, asset):
# This just validates that we can decode 10-bit videos on CPU.
# TODO: validate against a reference that the decoded frames are correct.

decoder = VideoDecoder(asset.path)
decoder.get_frame_at(10)


class TestAudioDecoder:
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
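The `match` strings in `test_10bit_videos_cuda` are computed but never asserted against. Presumably the intent is something like the sketch below, assuming the CUDA path still raises for the asset under test (the helper and its wiring are hypothetical, not part of this diff):

```python
import pytest

from torchcodec.decoders import VideoDecoder

def assert_raises_10bit_error(asset, match):
    # TORCH_CHECK failures surface in Python as RuntimeError.
    decoder = VideoDecoder(asset.path, device="cuda")
    with pytest.raises(RuntimeError, match=match):
        decoder.get_frame_at(0)
```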
22 changes: 22 additions & 0 deletions test/utils.py
@@ -367,6 +367,28 @@ def get_empty_chw_tensor(self, *, stream_index: int) -> torch.Tensor:
frames={}, # Automatically loaded from json file
)

# Video generated with:
# ffmpeg -f lavfi -i testsrc2=duration=1:size=200x200:rate=30 -c:v libx265 -pix_fmt yuv420p10le -preset fast -crf 23 h265_10bits.mp4
H265_10BITS = TestVideo(
filename="h265_10bits.mp4",
default_stream_index=0,
stream_infos={
0: TestVideoStreamInfo(width=200, height=200, num_color_channels=3),
},
frames={0: {}}, # Not needed yet
)

# Video generated with:
# ffmpeg -f lavfi -i testsrc2=duration=1:size=200x200:rate=30 -c:v libx264 -pix_fmt yuv420p10le -preset fast -crf 23 h264_10bits.mp4
H264_10BITS = TestVideo(
filename="h264_10bits.mp4",
default_stream_index=0,
stream_infos={
0: TestVideoStreamInfo(width=200, height=200, num_color_channels=3),
},
frames={0: {}}, # Not needed yet
)


@dataclass
class TestAudio(TestContainerFile):
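For reference, a sketch that regenerates both 10-bit assets from the ffmpeg commands quoted in the comments above (assumes an ffmpeg binary with libx264 and libx265 support is on PATH):

```python
import subprocess

for codec, name in (
    ("libx264", "h264_10bits.mp4"),
    ("libx265", "h265_10bits.mp4"),
):
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         "-i", "testsrc2=duration=1:size=200x200:rate=30",
         "-c:v", codec, "-pix_fmt", "yuv420p10le",
         "-preset", "fast", "-crf", "23", name],
        check=True,
    )
```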