Merged

Changes from 4 commits

23 commits
cd5f8aa
only 2 pixfmts, enable 6 color param combos
Dan-Flores Dec 11, 2025
939240b
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Dec 11, 2025
b538d13
comments
Dan-Flores Dec 11, 2025
0480f1f
comments2
Dan-Flores Dec 11, 2025
351a55d
adjust test, fix pixel format checks
Dan-Flores Dec 12, 2025
ffcf872
keep plumbing, only use nv12
Dan-Flores Dec 12, 2025
a671f31
error sooner on gpu on any pixel format
Dan-Flores Dec 16, 2025
a9ad8e0
skip non-default color params on 4+6, skip av1 gpu on 4
Dan-Flores Dec 16, 2025
261549d
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Dec 16, 2025
f4d777c
remove unused option
Dan-Flores Dec 16, 2025
da3a6d7
reduce diff
Dan-Flores Dec 16, 2025
1dc7690
add TODO, link issue
Dan-Flores Dec 17, 2025
ce86d61
restore None test case
Dan-Flores Dec 17, 2025
daf2fda
reuse codecContext color params, no hardcoded defaults
Dan-Flores Dec 17, 2025
8c2bcee
3 decimal places
Dan-Flores Jan 6, 2026
193d7c9
Merge branch 'main' of https://github.com/meta-pytorch/torchcodec int…
Dan-Flores Jan 6, 2026
e83c130
Merge branch 'gpu_pix_fmts' of https://github.com/Dan-Flores/torchcod…
Dan-Flores Jan 6, 2026
e63f118
actually 4 decimal places
Dan-Flores Jan 7, 2026
5e8745f
restore higher decimal points for BT2020
Dan-Flores Jan 7, 2026
d1cbaeb
add note, test adjustments
Dan-Flores Jan 8, 2026
ef03ad5
drop targetPixelFormat from CUDA api
Dan-Flores Jan 8, 2026
40f4c66
skip 3 failing checks on ffmpeg 7/8
Dan-Flores Jan 9, 2026
e28081f
move cuda pix fmt to DeviceInterface.h, test fixes
Dan-Flores Jan 9, 2026
130 changes: 101 additions & 29 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -373,22 +373,72 @@ std::string CudaDeviceInterface::getDetails() {
// Below are methods exclusive to video encoding:
// --------------------------------------------------------------------------
namespace {
// RGB to NV12 color conversion matrix for BT.601 limited range.
// NPP ColorTwist function used below expects the limited range
// color conversion matrix, and this matches FFmpeg's default behavior.
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
{0.257f, 0.504f, 0.098f, 16.0f},
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
{-0.148f, -0.291f, 0.439f, 128.0f},
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
{0.439f, -0.368f, -0.071f, 128.0f}};
// RGB to YUV conversion matrices to use in NPP color conversion functions
Contributor: Thanks for the note, it will be super useful if we ever need to go back to this in the future.

struct ColorConversionMatrices {
static constexpr Npp32f BT601_LIMITED[3][4] = {
{0.257f, 0.504f, 0.098f, 16.0f},
{-0.148f, -0.291f, 0.439f, 128.0f},
{0.439f, -0.368f, -0.071f, 128.0f}};

static constexpr Npp32f BT601_FULL[3][4] = {
{0.299f, 0.587f, 0.114f, 0.0f},
{-0.168736f, -0.331264f, 0.5f, 128.0f},
{0.5f, -0.418688f, -0.081312f, 128.0f}};

static constexpr Npp32f BT709_LIMITED[3][4] = {
{0.183f, 0.614f, 0.062f, 16.0f},
{-0.101f, -0.338f, 0.439f, 128.0f},
{0.439f, -0.399f, -0.040f, 128.0f}};

static constexpr Npp32f BT709_FULL[3][4] = {
{0.2126f, 0.7152f, 0.0722f, 0.0f},
{-0.114572f, -0.385428f, 0.5f, 128.0f},
{0.5f, -0.454153f, -0.045847f, 128.0f}};

static constexpr Npp32f BT2020_LIMITED[3][4] = {
{0.2256f, 0.5823f, 0.0509f, 16.0f},
{-0.122f, -0.315f, 0.439f, 128.0f},
{0.439f, -0.403f, -0.036f, 128.0f}};

static constexpr Npp32f BT2020_FULL[3][4] = {
{0.2627f, 0.6780f, 0.0593f, 0.0f},
{-0.139630f, -0.360370f, 0.5f, 128.0f},
{0.5f, -0.459786f, -0.040214f, 128.0f}};
};

// Returns conversion matrix based on codec context color space and range
const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
return ColorConversionMatrices::BT601_LIMITED;
} else if (codecContext->colorspace == AVCOL_SPC_BT709) {
return ColorConversionMatrices::BT709_LIMITED;
} else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
return ColorConversionMatrices::BT2020_LIMITED;
} else { // default to BT.601
return ColorConversionMatrices::BT601_LIMITED;
}
} else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
return ColorConversionMatrices::BT601_FULL;
} else if (codecContext->colorspace == AVCOL_SPC_BT709) {
return ColorConversionMatrices::BT709_FULL;
} else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
return ColorConversionMatrices::BT2020_FULL;
} else { // default to BT.601
return ColorConversionMatrices::BT601_FULL;
}
}
return ColorConversionMatrices::BT601_LIMITED;
}
} // namespace
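The limited-range rows above are just the corresponding full-range coefficients rescaled to the standard 8-bit limited excursions (219 luma codes with a +16 offset, 224 chroma codes around 128). A minimal NumPy check of that relationship, under that assumption (illustrative sketch, not part of the diff):

```python
import numpy as np

# Full-range luma coefficients (row 0, zero offset) for each matrix above.
full_luma = {
    "BT601": [0.299, 0.587, 0.114],
    "BT709": [0.2126, 0.7152, 0.0722],
    "BT2020": [0.2627, 0.6780, 0.0593],
}
limited_luma = {
    "BT601": [0.257, 0.504, 0.098],
    "BT709": [0.183, 0.614, 0.062],
    "BT2020": [0.2256, 0.5823, 0.0509],
}

for name, coeffs in full_luma.items():
    scaled = np.array(coeffs) * 219.0 / 255.0  # limited-range luma scale
    assert np.allclose(scaled, limited_luma[name], atol=1e-3), name

# Chroma rows scale by 224/255, e.g. the BT.601 U row:
u_full = np.array([-0.168736, -0.331264, 0.5])
assert np.allclose(u_full * 224.0 / 255.0, [-0.148, -0.291, 0.439], atol=1e-3)
```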

UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
const torch::Tensor& tensor,
int frameIndex,
AVCodecContext* codecContext) {
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) {
TORCH_CHECK(
tensor.dim() == 3 && tensor.size(0) == 3,
"Expected 3D RGB tensor (CHW format), got shape: ",
@@ -427,25 +477,48 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

NppiSize oSizeROI = {width, height};
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
validateInt64ToInt(
hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
avFrame->data,
avFrame->linesize,
oSizeROI,
defaultLimitedRangeRgbToNv12,
*nppCtx_);
NppStatus status;
switch (targetPixelFormat) {
case AV_PIX_FMT_NV12:
status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
getConversionMatrix(codecContext),
*nppCtx_);
break;
case AV_PIX_FMT_YUV420P:
status = nppiRGBToYUV420_8u_ColorTwist32f_C3P3R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
getConversionMatrix(codecContext),
*nppCtx_);
break;
default:
TORCH_CHECK(
false,
"CUDA encoding only supports NV12 and YUV420P formats, got ",
av_get_pix_fmt_name(targetPixelFormat));
}
Contributor: Above: this is related to my other comment below about enforcing targetPixelFormat to be AV_PIX_FMT_NV12 through the APIs. It's great that you are raising on this, and indeed this should never happen and this is a torchcodec bug. But we can avoid risking the bug entirely by simply not making targetPixelFormat a parameter! That's the value of "enforcing the invariant through the API".
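As a sanity check on the ColorTwist matrix form used above (each 3x4 row is assumed to apply per pixel as out = r0*R + r1*G + r2*B + r3), the BT.601 limited-range matrix should map white and black to the limited-range extremes. A small NumPy sketch, illustrative only:

```python
import numpy as np

bt601_limited = np.array([
    [0.257, 0.504, 0.098, 16.0],
    [-0.148, -0.291, 0.439, 128.0],
    [0.439, -0.368, -0.071, 128.0],
])

def rgb_to_yuv(rgb, matrix):
    # Affine form assumed for the 3x4 ColorTwist matrices: M[:, :3] @ [R, G, B] + M[:, 3]
    rgb = np.asarray(rgb, dtype=np.float64)
    return matrix[:, :3] @ rgb + matrix[:, 3]

print(rgb_to_yuv([255, 255, 255], bt601_limited))  # ~[235, 128, 128]
print(rgb_to_yuv([0, 0, 0], bt601_limited))        # [16, 128, 128]
```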


TORCH_CHECK(
status == NPP_SUCCESS,
"Failed to convert RGB to NV12: NPP error code ",
"Failed to convert RGB to ",
av_get_pix_fmt_name(targetPixelFormat),
": NPP error code ",
status);

// TODO-VideoEncoder: Enable configuration of color properties, similar to
// FFmpeg. Below are the default color properties used by FFmpeg.
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED
? codecContext->colorspace
: AVCOL_SPC_BT470BG; // BT.601
avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED
? codecContext->color_range
: AVCOL_RANGE_MPEG; // limited range

return avFrame;
}
@@ -454,7 +527,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
// to enable encoding with CUDA device. The hw_frames_ctx field is needed by
// FFmpeg to allocate frames on GPU's memory.
void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
AVCodecContext* codecContext) {
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) {
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
@@ -464,9 +538,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
hwFramesCtxRef != nullptr,
"Failed to allocate hardware frames context for codec");

// TODO-VideoEncoder: Enable user set pixel formats to be set
// (outPixelFormat_) and handled with the appropriate NPP function
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
codecContext->sw_pix_fmt = targetPixelFormat;
// Always set pixel format to support CUDA encoding.
codecContext->pix_fmt = AV_PIX_FMT_CUDA;

6 changes: 4 additions & 2 deletions src/torchcodec/_core/CudaDeviceInterface.h
@@ -44,10 +44,12 @@ class CudaDeviceInterface : public DeviceInterface {
UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
const torch::Tensor& tensor,
int frameIndex,
AVCodecContext* codecContext) override;
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) override;

void setupHardwareFrameContextForEncoding(
AVCodecContext* codecContext) override;
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) override;

private:
// Our CUDA decoding code assumes NV12 format. In order to handle other
6 changes: 4 additions & 2 deletions src/torchcodec/_core/DeviceInterface.h
@@ -145,14 +145,16 @@ class DeviceInterface {
virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
[[maybe_unused]] const torch::Tensor& tensor,
[[maybe_unused]] int frameIndex,
[[maybe_unused]] AVCodecContext* codecContext) {
[[maybe_unused]] AVCodecContext* codecContext,
[[maybe_unused]] AVPixelFormat targetPixelFormat) {
TORCH_CHECK(false);
}

// Function used for video encoding, only implemented in CudaDeviceInterface.
// It is here to isolate CUDA dependencies from CPU builds
virtual void setupHardwareFrameContextForEncoding(
[[maybe_unused]] AVCodecContext* codecContext) {
[[maybe_unused]] AVCodecContext* codecContext,
[[maybe_unused]] AVPixelFormat targetPixelFormat) {
TORCH_CHECK(false);
}

33 changes: 20 additions & 13 deletions src/torchcodec/_core/Encoder.cpp
@@ -775,23 +775,30 @@ void VideoEncoder::initializeEncoder(
outHeight_ = inHeight_;

if (videoStreamOptions.pixelFormat.has_value()) {
if (frames_.device().is_cuda()) {
if (frames_.device().is_cuda() &&
    !(outPixelFormat_ == AV_PIX_FMT_NV12 ||
      outPixelFormat_ == AV_PIX_FMT_YUV420P)) {
TORCH_CHECK(
false,
"GPU Video encoding currently only supports the NV12 pixel format. "
"Do not set pixel_format to use NV12.");
"GPU encoding only supports NV12 and YUV420P formats, got ",
av_get_pix_fmt_name(outPixelFormat_));
}
outPixelFormat_ =
validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
if (frames_.device().is_cuda()) {
// Default to YUV420P for CUDA encoding if unset.
outPixelFormat_ = AV_PIX_FMT_YUV420P;
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
}
}
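For readers skimming the hunk above, the output pixel format precedence it implements is roughly the following (a Python-style sketch with simplified names, mirroring the C++ rather than defining new behavior):

```python
def pick_output_pixel_format(user_format, is_cuda, codec_supported_formats):
    """Sketch of the precedence in VideoEncoder::initializeEncoder above."""
    if user_format is not None:
        if is_cuda and user_format not in ("nv12", "yuv420p"):
            raise RuntimeError("GPU encoding only supports NV12 and YUV420P")
        return user_format  # validated against the codec in the real code
    if is_cuda:
        return "yuv420p"  # CUDA default when unset
    # CPU default: first format the codec lists, else yuv420p
    return codec_supported_formats[0] if codec_supported_formats else "yuv420p"
```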

// Configure codec parameters
@@ -837,7 +844,7 @@ void VideoEncoder::initializeEncoder(
if (frames_.device().is_cuda() && deviceInterface_) {
deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
deviceInterface_->setupHardwareFrameContextForEncoding(
avCodecContext_.get());
avCodecContext_.get(), outPixelFormat_);
Contributor: So, we have this invariant throughout the GPU encoder that the output pixel format is always AV_PIX_FMT_NV12. It's good practice to enforce this invariant in the APIs themselves. Here, for example, there is no need for deviceInterface_->setupHardwareFrameContextForEncoding() to take the pixel format as a parameter, because it's an invariant that it should always be AV_PIX_FMT_NV12.

The same is true for all the other APIs of the device interface. There are different ways to enforce this invariant in the API; one simple one is to make outputPixelFormatForEncoding_ a public constexpr member.

Contributor Author: I was aiming to follow up quickly on adding support for other pixel formats, but I see how this parameter is confusing at the moment. I will remove it from the APIs until it is needed.

}

int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
Expand Down Expand Up @@ -883,7 +890,7 @@ void VideoEncoder::encode() {
UniqueAVFrame avFrame;
if (frames_.device().is_cuda() && deviceInterface_) {
auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
currFrame, i, avCodecContext_.get());
currFrame, i, avCodecContext_.get(), outPixelFormat_);
TORCH_CHECK(
cudaFrame != nullptr,
"convertCUDATensorToAVFrameForEncoding failed for frame ",
48 changes: 37 additions & 11 deletions test/test_encoders.py
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
if device == "cuda":
with pytest.raises(
RuntimeError,
match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
match="GPU encoding currently only supports NV12 and YUV420P pixel formats, got yuv444p",
):
getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
return

with pytest.raises(
@@ -1353,8 +1353,14 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
],
)
@pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
# TODO-VideoEncoder: Enable additional pixel formats ("yuv420p", "yuv444p")
def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
@pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p", None))
# BT.601, BT.709, BT.2020
@pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc"))
# Full/PC range, Limited/TV range
@pytest.mark.parametrize("color_range", ("pc", "tv"))
def test_nvenc_against_ffmpeg_cli(
self, tmp_path, format_codec, method, pixel_format, color_space, color_range
):
# Encode with FFmpeg CLI using nvenc codecs
format, codec = format_codec
device = "cuda"
@@ -1385,7 +1391,14 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
codec, # Use specified NVENC hardware encoder
]

ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12
if color_space:
ffmpeg_cmd.extend(["-colorspace", color_space])
if color_range:
ffmpeg_cmd.extend(["-color_range", color_range])
if pixel_format:
ffmpeg_cmd.extend(["-pix_fmt", pixel_format])
else: # VideoEncoder will default to yuv420p for nvenc codecs
ffmpeg_cmd.extend(["-pix_fmt", "yuv420p"])
if codec == "av1_nvenc":
ffmpeg_cmd.extend(["-rc", "constqp"]) # Set rate control mode for AV1
ffmpeg_cmd.extend(["-qp", str(qp)]) # Use lossless qp for other codecs
@@ -1396,18 +1409,24 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
encoder_extra_options = {"qp": qp}
if codec == "av1_nvenc":
encoder_extra_options["rc"] = 0 # constqp mode
if color_space:
encoder_extra_options["colorspace"] = color_space
if color_range:
encoder_extra_options["color_range"] = color_range
if method == "to_file":
encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
encoder.to_file(
dest=encoder_output_path,
codec=codec,
pixel_format=pixel_format,
extra_options=encoder_extra_options,
)
encoder_output = encoder_output_path
elif method == "to_tensor":
encoder_output = encoder.to_tensor(
format=format,
codec=codec,
pixel_format=pixel_format,
extra_options=encoder_extra_options,
)
elif method == "to_file_like":
@@ -1416,6 +1435,7 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
file_like=file_like,
format=format,
codec=codec,
pixel_format=pixel_format,
extra_options=encoder_extra_options,
)
encoder_output = file_like.getvalue()
@@ -1426,13 +1446,19 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
encoder_frames = self.decode(encoder_output).data

assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
# The combination of full (pc) range + bt709 results in worse accuracy
percentage = 91 if color_range == "pc" and color_space == "bt709" else 96
for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
assert psnr(ff_frame, enc_frame) > 25
assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)
assert_tensor_close_on_at_least(
ff_frame, enc_frame, percentage=percentage, atol=2
)
Contributor: Out of curiosity can you try to validate an upper bound on the absolute difference? E.g. something like

assert_close(..., rtol=0, atol=5)

would be pretty good. Might have to be higher than 5. If it's something like 50 then it's not worth it, as it's not informative. But if we can get a low upper-bound, that gives confidence that we're not too far off.

Contributor Author: The lowest bound I tested that passed was atol=40, which is pretty high. The current test tells us that 96% of the frames are within 2, which seems to be fairly high accuracy
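For context on the check being discussed: assert_tensor_close_on_at_least is a torchcodec test helper not shown in this diff; it is assumed to assert that at least `percentage`% of elements are within `atol`, roughly as in this sketch (the real helper may differ):

```python
import torch

def assert_tensor_close_on_at_least(actual, expected, *, percentage, atol):
    # Assumed semantics: at least `percentage`% of elements differ by at most `atol`.
    diff = (actual.float() - expected.float()).abs()
    close_pct = (diff <= atol).float().mean().item() * 100
    assert close_pct >= percentage, (
        f"only {close_pct:.2f}% of elements within atol={atol}, "
        f"expected at least {percentage}%"
    )

# e.g. identical uint8 frames trivially pass:
frame = torch.zeros(3, 64, 64, dtype=torch.uint8)
assert_tensor_close_on_at_least(frame, frame, percentage=96, atol=2)
```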


if method == "to_file":
ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
# pix_fmt nv12 is stored as yuv420p in metadata
assert encoder_metadata["pix_fmt"] == "yuv420p"
assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
metadata_fields = ["color_range", "color_space"]
ffmpeg_metadata = self._get_video_metadata(
ffmpeg_encoded_path, metadata_fields
)
encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"]
assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"]
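The metadata comparison relies on _get_video_metadata, which is not shown in this diff. Assuming it wraps ffprobe, the queried fields can be pulled like this (a sketch, not necessarily the helper's actual implementation):

```python
import json
import subprocess

def get_video_metadata(path, fields):
    # Query the first video stream for the requested fields via ffprobe.
    out = subprocess.run(
        [
            "ffprobe", "-v", "error", "-select_streams", "v:0",
            "-show_entries", "stream=" + ",".join(fields),
            "-of", "json", str(path),
        ],
        capture_output=True, check=True, text=True,
    ).stdout
    return json.loads(out)["streams"][0]

# e.g. get_video_metadata("nvenc_output.mp4", ["color_range", "color_space"])
```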