meta-pytorch · Dan-Flores · Jan 9, 2026 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -383,16 +383,120 @@ std::string CudaDeviceInterface::getDetails() {
 // Below are methods exclusive to video encoding:
 // --------------------------------------------------------------------------
 namespace {
-// RGB to NV12 color conversion matrix for BT.601 limited range.
-// NPP ColorTwist function used below expects the limited range
-// color conversion matrix, and this matches FFmpeg's default behavior.
-const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
-    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
-    {0.257f, 0.504f, 0.098f, 16.0f},
-    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
-    {-0.148f, -0.291f, 0.439f, 128.0f},
-    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
-    {0.439f, -0.368f, -0.071f, 128.0f}};
+// Note: [RGB -> YUV Color Conversion, limited color range]
+//
+// For context on this subject, first read the note:
+// [YUV -> RGB Color Conversion, color space and color range]
+// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
+//
+// Lets encode RGB -> YUV in the limited color range for BT.601 color space.
+// In limited range, the [0, 255] range is mapped into [16-235] for Y, and into
+// [16-240] for U,V.
+// To implement, we get the full range conversion matrix as before, then scale:
+// - Y channel: scale by (235-16)/255 = 219/255
+// - U,V channels: scale by (240-16)/255 = 224/255
+// https://en.wikipedia.org/wiki/YCbCr#Y%E2%80%B2PbPr_to_Y%E2%80%B2CbCr
+//
+// ```py
+// import torch
+// kr, kg, kb = 0.299, 0.587, 0.114  # BT.601 luma coefficients
+// u_scale = 2 * (1 - kb)
+// v_scale = 2 * (1 - kr)
+//
+// rgb_to_yuv_full = torch.tensor([
+//     [kr, kg, kb],
+//     [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
+//     [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
+// ])
+//
+// full_to_limited_y_scale = 219.0 / 255.0
+// full_to_limited_uv_scale = 224.0 / 255.0
+//
+// rgb_to_yuv_limited = rgb_to_yuv_full * torch.tensor([
+//     [full_to_limited_y_scale],
+//     [full_to_limited_uv_scale],
+//     [full_to_limited_uv_scale]
+// ])
+//
+// print("RGB->YUV matrix (Limited Range BT.601):")
+// print(rgb_to_yuv_limited)
+// ```
+//
+// This yields:
+// tensor([[ 0.2568,  0.5041,  0.0979],
+//         [-0.1482, -0.2910,  0.4392],
+//         [ 0.4392, -0.3678, -0.0714]])
+//
+// Which matches https://fourcc.org/fccyvrgb.php
+//
+// To perform color conversion in NPP, we are required to provide these color
+// conversion matrices to ColorTwist functions, for example,
+// `nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx`.
+// https://docs.nvidia.com/cuda/npp/image_color_conversion.html
+//
+// These offsets are added in the 4th column of each conversion matrix below.
+// - In limited range, Y is offset by 16 to add the lower margin.
+// - In both color ranges, U,V are offset by 128 to be centered around 0.
+//
+// RGB to YUV conversion matrices to use in NPP color conversion functions
+struct ColorConversionMatrices {
+  static constexpr Npp32f BT601_LIMITED[3][4] = {
+      {0.2568f, 0.5041f, 0.0979f, 16.0f},
+      {-0.1482f, -0.2910f, 0.4392f, 128.0f},
+      {0.4392f, -0.3678f, -0.0714f, 128.0f}};
+
+  static constexpr Npp32f BT601_FULL[3][4] = {
+      {0.2990f, 0.5870f, 0.1140f, 0.0f},
+      {-0.1687f, -0.3313f, 0.5000f, 128.0f},
+      {0.5000f, -0.4187f, -0.0813f, 128.0f}};
+
+  static constexpr Npp32f BT709_LIMITED[3][4] = {
+      {0.1826f, 0.6142f, 0.0620f, 16.0f},
+      {-0.1006f, -0.3386f, 0.4392f, 128.0f},
+      {0.4392f, -0.3989f, -0.0403f, 128.0f}};
+
+  static constexpr Npp32f BT709_FULL[3][4] = {
+      {0.2126f, 0.7152f, 0.0722f, 0.0f},
+      {-0.1146f, -0.3854f, 0.5000f, 128.0f},
+      {0.5000f, -0.4542f, -0.0458f, 128.0f}};
+
+  static constexpr Npp32f BT2020_LIMITED[3][4] = {
+      {0.2256f, 0.5823f, 0.0509f, 16.0f},
+      {-0.1227f, -0.3166f, 0.4392f, 128.0f},
+      {0.4392f, -0.4039f, -0.0353f, 128.0f}};
+
+  static constexpr Npp32f BT2020_FULL[3][4] = {
+      {0.2627f, 0.6780f, 0.0593f, 0.0f},
+      {-0.139630f, -0.360370f, 0.5000f, 128.0f},
+      {0.5000f, -0.459786f, -0.040214f, 128.0f}};
+};
+
+// Returns conversion matrix based on codec context color space and range
+const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
+  if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
+      codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_LIMITED;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_LIMITED;
+    }
+  } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_FULL;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_FULL;
+    }
+  }
+  return ColorConversionMatrices::BT601_LIMITED;
+}
 } // namespace
 
 UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
@@ -437,26 +541,26 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
   torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
 
   NppiSize oSizeROI = {width, height};
-  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+  NppStatus status;
+  // Convert to NV12, as CUDA_ENCODING_PIXEL_FORMAT is always NV12 currently
+  status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
       static_cast<const Npp8u*>(hwcFrame.data_ptr()),
-      validateInt64ToInt(
-          hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
+      hwcFrame.stride(0) * hwcFrame.element_size(),
       avFrame->data,
       avFrame->linesize,
       oSizeROI,
-      defaultLimitedRangeRgbToNv12,
+      getConversionMatrix(codecContext),
       *nppCtx_);
 
   TORCH_CHECK(
       status == NPP_SUCCESS,
-      "Failed to convert RGB to NV12: NPP error code ",
+      "Failed to convert RGB to ",
+      av_get_pix_fmt_name(DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT),
+      ": NPP error code ",
       status);
 
-  // TODO-VideoEncoder: Enable configuration of color properties, similar to
-  // FFmpeg. Below are the default color properties used by FFmpeg.
-  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
-  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
-
+  avFrame->colorspace = codecContext->colorspace;
+  avFrame->color_range = codecContext->color_range;
   return avFrame;
 }
 
@@ -474,9 +578,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
       hwFramesCtxRef != nullptr,
       "Failed to allocate hardware frames context for codec");
 
-  // TODO-VideoEncoder: Enable user set pixel formats to be set
-  // (outPixelFormat_) and handled with the appropriate NPP function
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
   // Always set pixel format to support CUDA encoding.
   codecContext->pix_fmt = AV_PIX_FMT_CUDA;
 

diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -139,6 +139,9 @@ class DeviceInterface {
     return "";
   }
 
+  // Pixel format used for encoding on CUDA devices
+  static constexpr AVPixelFormat CUDA_ENCODING_PIXEL_FORMAT = AV_PIX_FMT_NV12;
+
   // Function used for video encoding, only implemented in CudaDeviceInterface.
   // It is here to isolate CUDA dependencies from CPU builds
   // TODO Video-Encoder: Reconsider using video encoding functions in device

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -782,23 +782,30 @@ void VideoEncoder::initializeEncoder(
   outHeight_ = inHeight_;
 
   if (videoStreamOptions.pixelFormat.has_value()) {
+    // TODO-VideoEncoder: Enable pixel formats to be set by user
+    // and handled with the appropriate NPP function on GPU.
     if (frames_.device().is_cuda()) {
       TORCH_CHECK(
           false,
-          "GPU Video encoding currently only supports the NV12 pixel format. "
-          "Do not set pixel_format to use NV12.");
+          "Video encoding on GPU currently only supports the nv12 pixel format. "
+          "Do not set pixel_format to use nv12 by default.");
     }
     outPixelFormat_ =
         validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
   } else {
-    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
-    // Use first listed pixel format as default (often yuv420p).
-    // This is similar to FFmpeg's logic:
-    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
-    // If pixel formats are undefined for some reason, try yuv420p
-    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
-        ? formats[0]
-        : AV_PIX_FMT_YUV420P;
+    if (frames_.device().is_cuda()) {
+      // Default to nv12 pixel format when encoding on GPU.
+      outPixelFormat_ = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
+    } else {
+      const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+      // Use first listed pixel format as default (often yuv420p).
+      // This is similar to FFmpeg's logic:
+      // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+      // If pixel formats are undefined for some reason, try yuv420p
+      outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+          ? formats[0]
+          : AV_PIX_FMT_YUV420P;
+    }
   }
 
   // Configure codec parameters

diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -777,9 +777,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
         if device == "cuda":
             with pytest.raises(
                 RuntimeError,
-                match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
+                match="Video encoding on GPU currently only supports the nv12 pixel format. Do not set pixel_format to use nv12 by default.",
             ):
-                getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
+                getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
             return
 
         with pytest.raises(
@@ -1345,13 +1345,37 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
             pytest.param(
                 "mkv",
                 "av1_nvenc",
-                marks=pytest.mark.skipif(
-                    IN_GITHUB_CI, reason="av1_nvenc is not supported on CI"
-                ),
+                marks=[
+                    pytest.mark.skipif(
+                        IN_GITHUB_CI, reason="av1_nvenc is not supported on CI"
+                    ),
+                    pytest.mark.skipif(
+                        get_ffmpeg_major_version() == 4,
+                        reason="av1_nvenc is not supported on FFmpeg 4",
+                    ),
+                ],
             ),
         ],
     )
-    def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
+    # We test the color space and color range parameters in this test, because
+    # we are required to define matrices specific to these specs when using NPP, see note:
+    # [RGB -> YUV Color Conversion, limited color range]
+    # BT.601, BT.709, BT.2020
+    @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc", None))
+    # Full/PC range, Limited/TV range
+    @pytest.mark.parametrize("color_range", ("pc", "tv", None))
+    def test_nvenc_against_ffmpeg_cli(
+        self, tmp_path, method, format, codec, color_space, color_range
+    ):
+        ffmpeg_version = get_ffmpeg_major_version()
+        # TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range.
+        # See https://github.com/meta-pytorch/torchcodec/issues/1140
+        if ffmpeg_version in (4, 6) and not (
+            color_space == "bt470bg" and color_range == "tv"
+        ):
+            pytest.skip(
+                "Non-default color space and range have lower accuracy on FFmpeg 4 and 6"
+            )
         # Encode with FFmpeg CLI using nvenc codecs
         device = "cuda"
         qp = 1  # Use near lossless encoding to reduce noise and support av1_nvenc
@@ -1379,16 +1403,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
             temp_raw_path,
         ]
         # CLI requires explicit codec for nvenc
+        # VideoEncoder will default to h264_nvenc since the frames are on GPU.
         ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"])
-        # VideoEncoder will select an NVENC encoder by default since the frames are on GPU.
-
         ffmpeg_cmd.extend(["-pix_fmt", "nv12"])  # Output format is always NV12
         ffmpeg_cmd.extend(["-qp", str(qp)])
+        if color_space:
+            ffmpeg_cmd.extend(["-colorspace", color_space])
+        if color_range:
+            ffmpeg_cmd.extend(["-color_range", color_range])
         ffmpeg_cmd.extend([ffmpeg_encoded_path])
         subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
 
         encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
         encoder_extra_options = {"qp": qp}
+        if color_space:
+            encoder_extra_options["colorspace"] = color_space
+        if color_range:
+            encoder_extra_options["color_range"] = color_range
         if method == "to_file":
             encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
             encoder.to_file(
@@ -1424,8 +1455,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
             assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)
 
         if method == "to_file":
-            ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
-            encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
-            # pix_fmt nv12 is stored as yuv420p in metadata
-            assert encoder_metadata["pix_fmt"] == "yuv420p"
-            assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
+            metadata_fields = ["pix_fmt", "color_range", "color_space"]
+            ffmpeg_metadata = self._get_video_metadata(
+                ffmpeg_encoded_path, metadata_fields
+            )
+            encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
+            # pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc)is used
+            # In that case, h264 and hevc NVENC codecs will use yuvj420p automatically.
+            if color_range == "pc" and codec != "av1_nvenc":
+                expected_pix_fmt = "yuvj420p"
+            else:
+                # av1_nvenc does not utilize the yuvj420p pixel format
+                expected_pix_fmt = "yuv420p"
+            assert (
+                encoder_metadata["pix_fmt"]
+                == ffmpeg_metadata["pix_fmt"]
+                == expected_pix_fmt
+            )
+
+            assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"]
+            assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"]
+            # Default values vary by codec, so we only assert when
+            # color_range and color_space are not None.
+            if color_range is not None:
+                # FFmpeg and torchcodec encode color_range as 'unknown' for mov and avi
+                # when color_range='tv' and color_space=None on FFmpeg 7/8.
+                # Since this failure is rare, I suspect its a bug related to these
+                # older container formats on newer FFmpeg versions.
+                if not (
+                    ffmpeg_version in (7, 8)
+                    and color_range == "tv"
+                    and color_space is None
+                    and format in ("mov", "avi")
+                ):
+                    assert color_range == encoder_metadata["color_range"]
+            if color_space is not None:
+                assert color_space == encoder_metadata["color_space"]