Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cd5f8aa
only 2 pixfmts, enable 6 color param combos
Dan-Flores Dec 11, 2025
939240b
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Dec 11, 2025
b538d13
comments
Dan-Flores Dec 11, 2025
0480f1f
comments2
Dan-Flores Dec 11, 2025
351a55d
adjust test, fix pixel format checks
Dan-Flores Dec 12, 2025
ffcf872
keep plumbing, only use nv12
Dan-Flores Dec 12, 2025
a671f31
error sooner on gpu on any pixel format
Dan-Flores Dec 16, 2025
a9ad8e0
skip non-default color params on 4+6, skip av1 gpu on 4
Dan-Flores Dec 16, 2025
261549d
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Dec 16, 2025
f4d777c
remove unused option
Dan-Flores Dec 16, 2025
da3a6d7
reduce diff
Dan-Flores Dec 16, 2025
1dc7690
add TODO, liink issue
Dan-Flores Dec 17, 2025
ce86d61
restore None test case
Dan-Flores Dec 17, 2025
daf2fda
reuse codecContext color params, no hardcoded defaults
Dan-Flores Dec 17, 2025
8c2bcee
3 decimal places
Dan-Flores Jan 6, 2026
193d7c9
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Jan 6, 2026
e83c130
Merge branch 'gpu_pix_fmts' of https://github.com/Dan-Flores/torchcod…
Dan-Flores Jan 6, 2026
e63f118
actually 4 decimal places
Dan-Flores Jan 7, 2026
5e8745f
restore higher decimal points for BT2020
Dan-Flores Jan 7, 2026
d1cbaeb
add note, test adjustments
Dan-Flores Jan 8, 2026
ef03ad5
drop targetPixelFormat from CUDA api
Dan-Flores Jan 8, 2026
40f4c66
skip 3 failing checks on ffmpeg 7/8
Dan-Flores Jan 9, 2026
e28081f
move cuda pix fmt to DeviceInterface.h, test fixes
Dan-Flores Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 125 additions & 23 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,16 +383,120 @@ std::string CudaDeviceInterface::getDetails() {
// Below are methods exclusive to video encoding:
// --------------------------------------------------------------------------
namespace {
// RGB to NV12 color conversion matrix for BT.601 limited range.
// NPP ColorTwist function used below expects the limited range
// color conversion matrix, and this matches FFmpeg's default behavior.
// Layout: one row per output channel (Y, U, V); the first three columns are
// the R, G and B coefficients and the fourth column is the constant offset.
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
{0.257f, 0.504f, 0.098f, 16.0f},
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
{-0.148f, -0.291f, 0.439f, 128.0f},
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
{0.439f, -0.368f, -0.071f, 128.0f}};
// Note: [RGB -> YUV Color Conversion, limited color range]
//
// For context on this subject, first read the note:
// [YUV -> RGB Color Conversion, color space and color range]
// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
//
// Let's encode RGB -> YUV in the limited color range for BT.601 color space.
// In limited range, the [0, 255] range is mapped into [16, 235] for Y, and into
// [16, 240] for U,V.
// To implement, we get the full range conversion matrix as before, then scale:
// - Y channel: scale by (235-16)/255 = 219/255
// - U,V channels: scale by (240-16)/255 = 224/255
// https://en.wikipedia.org/wiki/YCbCr#Y%E2%80%B2PbPr_to_Y%E2%80%B2CbCr
//
// ```py
// import torch
// kr, kg, kb = 0.299, 0.587, 0.114 # BT.601 luma coefficients
// u_scale = 2 * (1 - kb)
// v_scale = 2 * (1 - kr)
//
// rgb_to_yuv_full = torch.tensor([
// [kr, kg, kb],
// [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
// [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
// ])
//
// full_to_limited_y_scale = 219.0 / 255.0
// full_to_limited_uv_scale = 224.0 / 255.0
//
// rgb_to_yuv_limited = rgb_to_yuv_full * torch.tensor([
// [full_to_limited_y_scale],
// [full_to_limited_uv_scale],
// [full_to_limited_uv_scale]
// ])
//
// print("RGB->YUV matrix (Limited Range BT.601):")
// print(rgb_to_yuv_limited)
// ```
//
// This yields:
// tensor([[ 0.2568, 0.5041, 0.0979],
// [-0.1482, -0.2910, 0.4392],
// [ 0.4392, -0.3678, -0.0714]])
//
// Which matches https://fourcc.org/fccyvrgb.php
//
// To perform color conversion in NPP, we are required to provide these color
// conversion matrices to ColorTwist functions, for example,
// `nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx`.
// https://docs.nvidia.com/cuda/npp/image_color_conversion.html
//
// Constant offsets are added via the 4th column of each conversion matrix
// below.
// - In limited range, Y is offset by 16 to add the lower margin.
// - In both color ranges, U,V are offset by 128 so that zero chroma maps to
//   the middle of the output range.
//
// RGB to YUV conversion matrices to use in NPP color conversion functions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the note, it will be super useful if we ever need to go back to this in the future.

// RGB -> YUV conversion matrices, one per (color space, color range) pair.
// Every matrix has one row per output channel (Y, U, V). The first three
// columns are the R, G and B coefficients, the fourth column is the constant
// offset added to that channel. In the tables below, *_LIMITED matrices offset
// Y by 16 and *_FULL matrices offset Y by 0; U and V are always offset by 128.
struct ColorConversionMatrices {
// BT.601, limited range. These are the values derived in the note
// [RGB -> YUV Color Conversion, limited color range].
static constexpr Npp32f BT601_LIMITED[3][4] = {
{0.2568f, 0.5041f, 0.0979f, 16.0f},
{-0.1482f, -0.2910f, 0.4392f, 128.0f},
{0.4392f, -0.3678f, -0.0714f, 128.0f}};

// BT.601, full range.
static constexpr Npp32f BT601_FULL[3][4] = {
{0.2990f, 0.5870f, 0.1140f, 0.0f},
{-0.1687f, -0.3313f, 0.5000f, 128.0f},
{0.5000f, -0.4187f, -0.0813f, 128.0f}};

// BT.709, limited range.
static constexpr Npp32f BT709_LIMITED[3][4] = {
{0.1826f, 0.6142f, 0.0620f, 16.0f},
{-0.1006f, -0.3386f, 0.4392f, 128.0f},
{0.4392f, -0.3989f, -0.0403f, 128.0f}};

// BT.709, full range.
static constexpr Npp32f BT709_FULL[3][4] = {
{0.2126f, 0.7152f, 0.0722f, 0.0f},
{-0.1146f, -0.3854f, 0.5000f, 128.0f},
{0.5000f, -0.4542f, -0.0458f, 128.0f}};

// BT.2020, limited range.
static constexpr Npp32f BT2020_LIMITED[3][4] = {
{0.2256f, 0.5823f, 0.0509f, 16.0f},
{-0.1227f, -0.3166f, 0.4392f, 128.0f},
{0.4392f, -0.4039f, -0.0353f, 128.0f}};

// BT.2020, full range. The U and V rows are written with six decimal places,
// unlike the other matrices.
// NOTE(review): presumably the extra digits are needed for accuracy - confirm
// before rounding them.
static constexpr Npp32f BT2020_FULL[3][4] = {
{0.2627f, 0.6780f, 0.0593f, 0.0f},
{-0.139630f, -0.360370f, 0.5000f, 128.0f},
{0.5000f, -0.459786f, -0.040214f, 128.0f}};
};

// Picks the RGB -> YUV conversion matrix that matches the color range and
// color space configured on `codecContext`.
//
// - AVCOL_RANGE_MPEG and AVCOL_RANGE_UNSPECIFIED select the limited range
//   matrices, AVCOL_RANGE_JPEG selects the full range matrices.
// - Any other color range falls back to BT.601 limited range, regardless of
//   the color space.
// - Color spaces other than BT.709 and BT.2020 (NCL) use the BT.601 matrix of
//   the selected range.
//
// Returns a pointer to the first row of a 3x4 matrix from
// ColorConversionMatrices.
const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
  const auto range = codecContext->color_range;
  const bool isFullRange = (range == AVCOL_RANGE_JPEG);
  const bool isLimitedRange =
      (range == AVCOL_RANGE_MPEG) || (range == AVCOL_RANGE_UNSPECIFIED);

  // Unrecognized color range: ignore the color space entirely.
  if (!isFullRange && !isLimitedRange) {
    return ColorConversionMatrices::BT601_LIMITED;
  }

  switch (codecContext->colorspace) {
    case AVCOL_SPC_BT709:
      return isFullRange ? ColorConversionMatrices::BT709_FULL
                         : ColorConversionMatrices::BT709_LIMITED;
    case AVCOL_SPC_BT2020_NCL:
      return isFullRange ? ColorConversionMatrices::BT2020_FULL
                         : ColorConversionMatrices::BT2020_LIMITED;
    case AVCOL_SPC_BT470BG:
    default: // default to BT.601
      return isFullRange ? ColorConversionMatrices::BT601_FULL
                         : ColorConversionMatrices::BT601_LIMITED;
  }
}
} // namespace

UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
Expand Down Expand Up @@ -437,26 +541,26 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

NppiSize oSizeROI = {width, height};
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
NppStatus status;
// Convert to NV12, as CUDA_ENCODING_PIXEL_FORMAT is always NV12 currently
status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
validateInt64ToInt(
hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
defaultLimitedRangeRgbToNv12,
getConversionMatrix(codecContext),
*nppCtx_);

TORCH_CHECK(
status == NPP_SUCCESS,
"Failed to convert RGB to NV12: NPP error code ",
"Failed to convert RGB to ",
av_get_pix_fmt_name(DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT),
": NPP error code ",
status);

// TODO-VideoEncoder: Enable configuration of color properties, similar to
// FFmpeg. Below are the default color properties used by FFmpeg.
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

avFrame->colorspace = codecContext->colorspace;
avFrame->color_range = codecContext->color_range;
return avFrame;
}

Expand All @@ -474,9 +578,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
hwFramesCtxRef != nullptr,
"Failed to allocate hardware frames context for codec");

// TODO-VideoEncoder: Enable user set pixel formats to be set
// (outPixelFormat_) and handled with the appropriate NPP function
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
// Always set pixel format to support CUDA encoding.
codecContext->pix_fmt = AV_PIX_FMT_CUDA;

Expand Down
3 changes: 3 additions & 0 deletions src/torchcodec/_core/DeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class DeviceInterface {
return "";
}

// Pixel format used for encoding on CUDA devices
static constexpr AVPixelFormat CUDA_ENCODING_PIXEL_FORMAT = AV_PIX_FMT_NV12;

// Function used for video encoding, only implemented in CudaDeviceInterface.
// It is here to isolate CUDA dependencies from CPU builds
// TODO Video-Encoder: Reconsider using video encoding functions in device
Expand Down
27 changes: 17 additions & 10 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,23 +782,30 @@ void VideoEncoder::initializeEncoder(
outHeight_ = inHeight_;

if (videoStreamOptions.pixelFormat.has_value()) {
// TODO-VideoEncoder: Enable pixel formats to be set by user
// and handled with the appropriate NPP function on GPU.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved this TODO from setupHardwareFrameContextForEncoding to here in initializeEncoder to centralize pixel_format handling.

The behavior is unchanged: If pixel_format argument is used while frames are on GPU, an error is raised.
The default usage of nv12 is moved into `initializeEncoder` as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we raising an error for pixel_format on gpu because of what nicolas mentioned below? that passing NV12 leads to yuv420?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially yes. Because we do not understand the codec's behavior yet, we do not want the user to set or expect a pixel format.

if (frames_.device().is_cuda()) {
TORCH_CHECK(
false,
"GPU Video encoding currently only supports the NV12 pixel format. "
"Do not set pixel_format to use NV12.");
"Video encoding on GPU currently only supports the nv12 pixel format. "
"Do not set pixel_format to use nv12 by default.");
}
outPixelFormat_ =
validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
if (frames_.device().is_cuda()) {
// Default to nv12 pixel format when encoding on GPU.
outPixelFormat_ = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
}
}

// Configure codec parameters
Expand Down
88 changes: 75 additions & 13 deletions test/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,9 +777,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
if device == "cuda":
with pytest.raises(
RuntimeError,
match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
match="Video encoding on GPU currently only supports the nv12 pixel format. Do not set pixel_format to use nv12 by default.",
):
getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
return

with pytest.raises(
Expand Down Expand Up @@ -1345,13 +1345,37 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
pytest.param(
"mkv",
"av1_nvenc",
marks=pytest.mark.skipif(
IN_GITHUB_CI, reason="av1_nvenc is not supported on CI"
),
marks=[
pytest.mark.skipif(
IN_GITHUB_CI, reason="av1_nvenc is not supported on CI"
),
pytest.mark.skipif(
get_ffmpeg_major_version() == 4,
reason="av1_nvenc is not supported on FFmpeg 4",
),
],
),
],
)
def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
# We test the color space and color range parameters in this test, because
# we are required to define matrices specific to these specs when using NPP, see note:
# [RGB -> YUV Color Conversion, limited color range]
# BT.601, BT.709, BT.2020
@pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc", None))
# Full/PC range, Limited/TV range
@pytest.mark.parametrize("color_range", ("pc", "tv", None))
def test_nvenc_against_ffmpeg_cli(
self, tmp_path, method, format, codec, color_space, color_range
):
ffmpeg_version = get_ffmpeg_major_version()
# TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range.
# See https://github.com/meta-pytorch/torchcodec/issues/1140
if ffmpeg_version in (4, 6) and not (
color_space == "bt470bg" and color_range == "tv"
):
pytest.skip(
"Non-default color space and range have lower accuracy on FFmpeg 4 and 6"
)
# Encode with FFmpeg CLI using nvenc codecs
device = "cuda"
qp = 1 # Use near lossless encoding to reduce noise and support av1_nvenc
Expand Down Expand Up @@ -1379,16 +1403,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
temp_raw_path,
]
# CLI requires explicit codec for nvenc
# VideoEncoder will default to h264_nvenc since the frames are on GPU.
ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"])
# VideoEncoder will select an NVENC encoder by default since the frames are on GPU.

ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12
ffmpeg_cmd.extend(["-qp", str(qp)])
if color_space:
ffmpeg_cmd.extend(["-colorspace", color_space])
if color_range:
ffmpeg_cmd.extend(["-color_range", color_range])
ffmpeg_cmd.extend([ffmpeg_encoded_path])
subprocess.run(ffmpeg_cmd, check=True, capture_output=True)

encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
encoder_extra_options = {"qp": qp}
if color_space:
encoder_extra_options["colorspace"] = color_space
if color_range:
encoder_extra_options["color_range"] = color_range
if method == "to_file":
encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
encoder.to_file(
Expand Down Expand Up @@ -1424,8 +1455,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)

if method == "to_file":
ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
# pix_fmt nv12 is stored as yuv420p in metadata
assert encoder_metadata["pix_fmt"] == "yuv420p"
assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
metadata_fields = ["pix_fmt", "color_range", "color_space"]
ffmpeg_metadata = self._get_video_metadata(
ffmpeg_encoded_path, metadata_fields
)
encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
# pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc) is used.
# In that case, h264 and hevc NVENC codecs will use yuvj420p automatically.
if color_range == "pc" and codec != "av1_nvenc":
expected_pix_fmt = "yuvj420p"
else:
# av1_nvenc does not utilize the yuvj420p pixel format
expected_pix_fmt = "yuv420p"
assert (
encoder_metadata["pix_fmt"]
== ffmpeg_metadata["pix_fmt"]
== expected_pix_fmt
)

assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"]
assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"]
# Default values vary by codec, so we only assert when
# color_range and color_space are not None.
if color_range is not None:
# FFmpeg and torchcodec encode color_range as 'unknown' for mov and avi
# when color_range='tv' and color_space=None on FFmpeg 7/8.
# Since this failure is rare, I suspect it's a bug related to these
# older container formats on newer FFmpeg versions.
if not (
ffmpeg_version in (7, 8)
and color_range == "tv"
and color_space is None
and format in ("mov", "avi")
):
assert color_range == encoder_metadata["color_range"]
if color_space is not None:
assert color_space == encoder_metadata["color_space"]
Loading