meta-pytorch · Dan-Flores · Jan 5, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/src/torchcodec/_core/CUDACommon.cpp b/src/torchcodec/_core/CUDACommon.cpp
@@ -57,8 +57,7 @@ void initializeCudaContextWithPytorch(const torch::Device& device) {
 // Color space and color range are independent concepts, so we can have a BT.709
 // with full range, and another one with limited range. Same for BT.601.
 //
-// In the first version of this note we'll focus on the full color range. It
-// will later be updated to account for the limited range.
+// First, we'll consider the conversion in the full color range.
 //
 // Color conversion matrix
 // -----------------------
@@ -110,6 +109,50 @@ void initializeCudaContextWithPytorch(const torch::Device& device) {
 //
 // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
 //
+//
+// Next, let's consider encoding RGB -> YUV in the limited color range.
+// Y is in the range [16, 235] and U, V are in the range [16, 240].
+// The reduced range provides margins for errors during processing.
+// https://en.wikipedia.org/wiki/YCbCr#Y%E2%80%B2PbPr_to_Y%E2%80%B2CbCr
+//
+// To encode RGB -> YUV in limited range, we start with the full range conversion
+// matrix derived above, then scale it to compress into the limited ranges:
+// - RGB [0,255] -> Y [16,235]: compress by (235-16)/255 = 219/255
+// - RGB [0,255] -> U,V [16,240]: compress by (240-16)/255 = 224/255
+//
+// ```py
+// import torch
+// kr, kg, kb = 0.2126, 0.7152, 0.0722  # BT.709 luma coefficients
+// u_scale = 2 * (1 - kb)
+// v_scale = 2 * (1 - kr)
+//
+// rgb_to_yuv_full = torch.tensor([
+//     [kr, kg, kb],
+//     [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
+//     [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
+// ])
+//
+// full_to_limited_y_scale = 219.0 / 255.0
+// full_to_limited_uv_scale = 224.0 / 255.0
+//
+// rgb_to_yuv_limited = rgb_to_yuv_full * torch.tensor([
+//     [full_to_limited_y_scale],
+//     [full_to_limited_uv_scale],
+//     [full_to_limited_uv_scale]
+// ])
+//
+// print("RGB->YUV matrix (Limited Range BT.709):")
+// print(rgb_to_yuv_limited)
+// ```
+//
+// This yields:
+// tensor([[ 0.1826,  0.6142,  0.0620],
+//         [-0.1006, -0.3386,  0.4392],
+//         [ 0.4392, -0.3989, -0.0403]])
 // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
+//
+// Which is the matrix we store in CudaDeviceInterface.
+// TODO: land PR that adds this matrix in CudaDeviceInterface
+//
 // Color conversion in NPP
 // -----------------------
 // https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
@@ -137,16 +180,16 @@ void initializeCudaContextWithPytorch(const torch::Device& device) {
 //   the decoder.
 // - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
 //   be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
-//   offset to U and V. Y doesn't need to be offset. The offset can be applied
-//   by adding a 4th column to the matrix.
+//   offset to U and V. Y needs an offset of -16, only when using limited range.
+//   The offsets can be applied by adding a 4th column to the matrix.
 //
 //
 // So our conversion matrix becomes the following, with new offset column:
 // tensor([[ 1.0000e+00, -3.3142e-09,  1.5748e+00,     0]
 //         [ 1.0000e+00, -1.8732e-01, -4.6812e-01,     -128]
 //         [ 1.0000e+00,  1.8556e+00,  4.6231e-09 ,    -128]])
 //
-// And that's what we need to pass for BT701, full range.
+// And that's what we need to pass for BT709, full range.
 /* clang-format on */
 
 // BT.709 full range color conversion matrix for YUV to RGB conversion.