@@ -20,6 +20,13 @@ static bool g_cuda =
20
20
return new CudaDeviceInterface (device);
21
21
});
22
22
23
+ // BT.709 full range color conversion matrix for YUV to RGB conversion.
24
+ // See Note [YUV -> RGB Color Conversion, color space and color range] below.
// Layout: each row produces one output channel (R, G, B in that order) from
// the (Y, U, V) inputs. The first three columns are the YUV -> RGB
// coefficients; the 4th column carries the offset column described in the
// note: 0 for Y, and -128 so that U and V are centered around zero.
25
+ constexpr Npp32f bt709FullRangeColorTwist[3 ][4 ] = {
26
+ {1 .0f , 0 .0f , 1 .5748f , 0 .0f },
27
+ {1 .0f , -0 .187324273f , -0 .468124273f , -128 .0f },
28
+ {1 .0f , 1 .8556f , 0 .0f , -128 .0f }};
29
+
23
30
// We reuse cuda contexts across VideoDecoder instances. This is because
24
31
// creating a cuda context is expensive. The cache mechanism is as follows:
25
32
// 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
@@ -312,21 +319,54 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
312
319
static_cast <int >(getFFMPEGCompatibleDeviceIndex (device_)));
313
320
314
321
NppiSize oSizeROI = {width, height};
315
- Npp8u* input [2 ] = {avFrame->data [0 ], avFrame->data [1 ]};
322
+ Npp8u* yuvData [2 ] = {avFrame->data [0 ], avFrame->data [1 ]};
316
323
317
324
NppStatus status;
318
325
326
+ // For background, see
327
+ // Note [YUV -> RGB Color Conversion, color space and color range]
319
328
if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
320
- status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx (
321
- input,
322
- avFrame->linesize [0 ],
323
- static_cast <Npp8u*>(dst.data_ptr ()),
324
- dst.stride (0 ),
325
- oSizeROI,
326
- nppCtx);
329
+ if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
330
+ // NPP provides a pre-defined color conversion function for BT.709 full
331
+ // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it's not closely
332
+ // matching the results we have on CPU. So we're using a custom color
333
+ // conversion matrix, which provides more accurate results. See the note
334
+ // mentioned above for details, and headaches.
335
+
336
+ int srcStep[2 ] = {avFrame->linesize [0 ], avFrame->linesize [1 ]};
337
+
338
+ status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx (
339
+ yuvData,
340
+ srcStep,
341
+ static_cast <Npp8u*>(dst.data_ptr ()),
342
+ dst.stride (0 ),
343
+ oSizeROI,
344
+ bt709FullRangeColorTwist,
345
+ nppCtx);
346
+ } else {
347
+ // If not full range, we assume studio limited range.
348
+ // The color conversion matrix for BT.709 limited range should be:
349
+ // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
350
+ // {1.16438356f, 0.0f, 1.79274107f, -16.0f},
351
+ // {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
352
+ // {1.16438356f, 2.11240179f, 0.0f, -128.0f}
353
+ // };
354
+ // We get very close results to CPU with that, but using the pre-defined
355
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
356
+ status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx (
357
+ yuvData,
358
+ avFrame->linesize [0 ],
359
+ static_cast <Npp8u*>(dst.data_ptr ()),
360
+ dst.stride (0 ),
361
+ oSizeROI,
362
+ nppCtx);
363
+ }
327
364
} else {
365
+ // TODO we're assuming BT.601 color space (and probably limited range) by
366
+ // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
367
+ // and other color-spaces like 2020.
328
368
status = nppiNV12ToRGB_8u_P2C3R_Ctx (
329
- input ,
369
+ yuvData ,
330
370
avFrame->linesize [0 ],
331
371
static_cast <Npp8u*>(dst.data_ptr ()),
332
372
dst.stride (0 ),
@@ -362,3 +402,123 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
362
402
}
363
403
364
404
} // namespace facebook::torchcodec
405
+
406
+ /* clang-format off */
407
+ // Note [YUV -> RGB Color Conversion, color space and color range]
408
+ //
409
+ // The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
410
+ // format. We need to convert them to RGB. This note attempts to describe this
411
+ // process. There may be some inaccuracies and approximations that experts will
412
+ // notice, but our goal is only to provide a good enough understanding of the
413
+ // process for torchcodec developers to implement and maintain it.
414
+ // On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
415
+ // to do a lot of the heavy lifting ourselves.
416
+ //
417
+ // Color space and color range
418
+ // ---------------------------
419
+ // Two main characteristics of a frame will affect the conversion process:
420
+ // 1. Color space: This basically defines what YUV values correspond to which
421
+ // physical wavelength. No need to go into details here, the point is that
422
+ // videos can come in different color spaces, the most common ones being
423
+ // BT.601 and BT.709, but there are others.
424
+ // In FFmpeg this is represented with AVColorSpace:
425
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
426
+ // 2. Color range: This defines the range of YUV values. There is:
427
+ // - full range, also called PC range: AVCOL_RANGE_JPEG
428
+ // - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
429
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
430
+ //
431
+ // Color space and color range are independent concepts, so we can have a BT.709
432
+ // with full range, and another one with limited range. Same for BT.601.
433
+ //
434
+ // In the first version of this note we'll focus on the full color range. It
435
+ // will later be updated to account for the limited range.
436
+ //
437
+ // Color conversion matrix
438
+ // -----------------------
439
+ // YUV -> RGB conversion is defined as the inverse of the RGB -> YUV
440
+ // conversion, so this is where we'll start.
441
+ // At the core of a RGB -> YUV conversion are the "luma coefficients", which are
442
+ // specific to a given color space and defined by the color space standard. In
443
+ // FFmpeg they can be found here:
444
+ // https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
445
+ //
446
+ // For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
447
+ // Coefficients must sum to 1.
448
+ //
449
+ // Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
450
+ // (that's mathematically, in practice they are represented in integer range).
451
+ // The conversion is defined as:
452
+ // https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
453
+ // Y = kr*R + kg*G + kb*B
454
+ // U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
455
+ // V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
456
+ //
457
+ // Putting all this into matrix form, we get:
458
+ // [Y] = [kr kg kb ] [R]
459
+ // [U] [-kr/u_scale -kg/u_scale (1-kb)/u_scale] [G]
460
+ // [V] [(1-kr)/v_scale -kg/v_scale -kb/v_scale ] [B]
461
+ //
462
+ //
463
+ // Now, to convert YUV to RGB, we just need to invert this matrix:
464
+ // ```py
465
+ // import torch
466
+ // kr, kg, kb = 0.2126, 0.7152, 0.0722 # BT.709 luma coefficients
467
+ // u_scale = 2 * (1 - kb)
468
+ // v_scale = 2 * (1 - kr)
469
+ //
470
+ // rgb_to_yuv = torch.tensor([
471
+ // [kr, kg, kb],
472
+ // [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
473
+ // [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
474
+ // ])
475
+ //
476
+ // yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
477
+ // print("YUV->RGB matrix (Full Range):")
478
+ // print(yuv_to_rgb_full)
479
+ // ```
480
+ // And we get:
481
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00],
482
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
483
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09]])
484
+ //
485
+ // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
486
+ //
487
+ // Color conversion in NPP
488
+ // -----------------------
489
+ // https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
490
+ //
491
+ // NPP provides different ways to convert YUV to RGB:
492
+ // - pre-defined color conversion functions like
493
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx
494
+ // which are for BT.709 limited and full range, respectively.
495
+ // - generic color conversion functions that accept a custom color conversion
496
+ // matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx
497
+ //
498
+ // We use the pre-defined functions or the color twist functions depending on
499
+ // which one we find to be closer to the CPU results.
500
+ //
501
+ // The color twist functionality is *partially* described in a section named
502
+ // "YUVToRGBColorTwist". Importantly:
503
+ //
504
+ // - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
505
+ // and the color-conversion matrix as input. The function itself and the
506
+ // matrix assume different ranges for YUV values:
507
+ // - The **matrix coefficients** must assume that Y is in [0, 1] and U, V are in
508
+ // [-0.5, 0.5]. That's how we defined our matrix above.
509
+ // - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects all
510
+ // of the input Y, U, V to be in [0, 255]. That's how the data comes out of
511
+ // the decoder.
512
+ // - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
513
+ // be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
514
+ // offset to U and V. Y doesn't need to be offset. The offset can be applied
515
+ // by adding a 4th column to the matrix.
516
+ //
517
+ //
518
+ // So our conversion matrix becomes the following, with new offset column:
519
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00, 0],
520
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128],
521
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09, -128]])
522
+ //
523
+ // And that's what we need to pass for BT.709, full range.
524
+ /* clang-format on */
0 commit comments