-
Notifications
You must be signed in to change notification settings - Fork 90
Add color handling to VideoEncoder GPU #1125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
cd5f8aa
939240b
b538d13
0480f1f
351a55d
ffcf872
a671f31
a9ad8e0
261549d
f4d777c
da3a6d7
1dc7690
ce86d61
daf2fda
8c2bcee
193d7c9
e83c130
e63f118
5e8745f
d1cbaeb
ef03ad5
40f4c66
e28081f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -373,22 +373,72 @@ std::string CudaDeviceInterface::getDetails() { | |
| // Below are methods exclusive to video encoding: | ||
| // -------------------------------------------------------------------------- | ||
| namespace { | ||
| // RGB to NV12 color conversion matrix for BT.601 limited range. | ||
| // NPP ColorTwist function used below expects the limited range | ||
| // color conversion matrix, and this matches FFmpeg's default behavior. | ||
| const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { | ||
| // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) | ||
| {0.257f, 0.504f, 0.098f, 16.0f}, | ||
| // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) | ||
| {-0.148f, -0.291f, 0.439f, 128.0f}, | ||
| // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) | ||
| {0.439f, -0.368f, -0.071f, 128.0f}}; | ||
| // RGB to YUV conversion matrices to use in NPP color conversion functions | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the note; it will be super useful if we ever need to go back to this in the future. |
||
| // Holds the 3x4 RGB -> YUV matrices, one per (colorspace, range) pair, that | ||
| // getConversionMatrix() returns for the NPP ColorTwist conversion calls. | ||
| // Each row produces one output channel (Y, then U, then V): the first three | ||
| // columns multiply R, G and B, and the fourth column is a constant offset. | ||
| // *_LIMITED matrices use a luma offset of 16, *_FULL matrices use 0; the | ||
| // chroma rows always use an offset of 128. | ||
| struct ColorConversionMatrices { | ||
| // BT.601, limited range. | ||
| static constexpr Npp32f BT601_LIMITED[3][4] = { | ||
| {0.257f, 0.504f, 0.098f, 16.0f}, | ||
| {-0.148f, -0.291f, 0.439f, 128.0f}, | ||
| {0.439f, -0.368f, -0.071f, 128.0f}}; | ||
|
|
||
| // BT.601, full range. | ||
| static constexpr Npp32f BT601_FULL[3][4] = { | ||
| {0.299f, 0.587f, 0.114f, 0.0f}, | ||
| {-0.168736f, -0.331264f, 0.5f, 128.0f}, | ||
| {0.5f, -0.418688f, -0.081312f, 128.0f}}; | ||
|
|
||
| // BT.709, limited range. | ||
| static constexpr Npp32f BT709_LIMITED[3][4] = { | ||
| {0.183f, 0.614f, 0.062f, 16.0f}, | ||
| {-0.101f, -0.338f, 0.439f, 128.0f}, | ||
| {0.439f, -0.399f, -0.040f, 128.0f}}; | ||
|
|
||
| // BT.709, full range. | ||
| static constexpr Npp32f BT709_FULL[3][4] = { | ||
| {0.2126f, 0.7152f, 0.0722f, 0.0f}, | ||
| {-0.114572f, -0.385428f, 0.5f, 128.0f}, | ||
| {0.5f, -0.454153f, -0.045847f, 128.0f}}; | ||
|
|
||
| // BT.2020, limited range. | ||
| // NOTE(review): the U-row RGB coefficients sum to 0.002 rather than 0 | ||
| // (-0.122 - 0.315 + 0.439), unlike the other matrices; confirm the -0.315 | ||
| // entry against the range-scaled BT.2020 value. | ||
| static constexpr Npp32f BT2020_LIMITED[3][4] = { | ||
| {0.2256f, 0.5823f, 0.0509f, 16.0f}, | ||
| {-0.122f, -0.315f, 0.439f, 128.0f}, | ||
| {0.439f, -0.403f, -0.036f, 128.0f}}; | ||
|
|
||
| // BT.2020, full range. | ||
| static constexpr Npp32f BT2020_FULL[3][4] = { | ||
| {0.2627f, 0.6780f, 0.0593f, 0.0f}, | ||
| {-0.139630f, -0.360370f, 0.5f, 128.0f}, | ||
| {0.5f, -0.459786f, -0.040214f, 128.0f}}; | ||
| }; | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // Selects the RGB -> YUV matrix that matches the codec context's colorspace | ||
| // and color range. An unspecified range is treated as limited (MPEG) range, | ||
| // any colorspace other than BT.709 / BT.2020 NCL maps to BT.601, and an | ||
| // unrecognized range value yields the BT.601 limited-range matrix. | ||
| const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] { | ||
|   using Matrices = ColorConversionMatrices; | ||
|   const auto range = codecContext->color_range; | ||
|   const bool fullRange = range == AVCOL_RANGE_JPEG; | ||
|   const bool limitedRange = | ||
|       range == AVCOL_RANGE_MPEG || range == AVCOL_RANGE_UNSPECIFIED; | ||
|   if (!fullRange && !limitedRange) { | ||
|     // Neither full nor limited: keep the BT.601 limited-range fallback. | ||
|     return Matrices::BT601_LIMITED; | ||
|   } | ||
|   switch (codecContext->colorspace) { | ||
|     case AVCOL_SPC_BT709: | ||
|       return fullRange ? Matrices::BT709_FULL : Matrices::BT709_LIMITED; | ||
|     case AVCOL_SPC_BT2020_NCL: | ||
|       return fullRange ? Matrices::BT2020_FULL : Matrices::BT2020_LIMITED; | ||
|     case AVCOL_SPC_BT470BG: | ||
|     default: // default to BT.601 | ||
|       return fullRange ? Matrices::BT601_FULL : Matrices::BT601_LIMITED; | ||
|   } | ||
| } | ||
| } // namespace | ||
|
|
||
| UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( | ||
| const torch::Tensor& tensor, | ||
| int frameIndex, | ||
| AVCodecContext* codecContext) { | ||
| AVCodecContext* codecContext, | ||
| AVPixelFormat targetPixelFormat) { | ||
| TORCH_CHECK( | ||
| tensor.dim() == 3 && tensor.size(0) == 3, | ||
| "Expected 3D RGB tensor (CHW format), got shape: ", | ||
|
|
@@ -427,25 +477,48 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( | |
| torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous(); | ||
|
|
||
| NppiSize oSizeROI = {width, height}; | ||
| NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| validateInt64ToInt( | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| defaultLimitedRangeRgbToNv12, | ||
| *nppCtx_); | ||
| NppStatus status; | ||
| switch (targetPixelFormat) { | ||
| case AV_PIX_FMT_NV12: | ||
| status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| getConversionMatrix(codecContext), | ||
| *nppCtx_); | ||
| break; | ||
| case AV_PIX_FMT_YUV420P: | ||
| status = nppiRGBToYUV420_8u_ColorTwist32f_C3P3R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| getConversionMatrix(codecContext), | ||
| *nppCtx_); | ||
| break; | ||
| default: | ||
| TORCH_CHECK( | ||
| false, | ||
| "CUDA encoding only supports NV12 and YUV420P formats, got ", | ||
| av_get_pix_fmt_name(targetPixelFormat)); | ||
| } | ||
|
||
|
|
||
| TORCH_CHECK( | ||
| status == NPP_SUCCESS, | ||
| "Failed to convert RGB to NV12: NPP error code ", | ||
| "Failed to convert RGB to ", | ||
| av_get_pix_fmt_name(targetPixelFormat), | ||
| ": NPP error code ", | ||
| status); | ||
|
|
||
| // TODO-VideoEncoder: Enable configuration of color properties, similar to | ||
| // FFmpeg. Below are the default color properties used by FFmpeg. | ||
| avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 | ||
| avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range | ||
| avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED | ||
| ? codecContext->colorspace | ||
| : AVCOL_SPC_BT470BG; // BT.601 | ||
| avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED | ||
| ? codecContext->color_range | ||
| : AVCOL_RANGE_MPEG; // limited range | ||
|
|
||
| return avFrame; | ||
| } | ||
|
|
@@ -454,7 +527,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( | |
| // to enable encoding with CUDA device. The hw_frames_ctx field is needed by | ||
| // FFmpeg to allocate frames on GPU's memory. | ||
| void CudaDeviceInterface::setupHardwareFrameContextForEncoding( | ||
| AVCodecContext* codecContext) { | ||
| AVCodecContext* codecContext, | ||
| AVPixelFormat targetPixelFormat) { | ||
| TORCH_CHECK(codecContext != nullptr, "codecContext is null"); | ||
| TORCH_CHECK( | ||
| hardwareDeviceCtx_, "Hardware device context has not been initialized"); | ||
|
|
@@ -464,9 +538,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding( | |
| hwFramesCtxRef != nullptr, | ||
| "Failed to allocate hardware frames context for codec"); | ||
|
|
||
| // TODO-VideoEncoder: Enable user set pixel formats to be set | ||
| // (outPixelFormat_) and handled with the appropriate NPP function | ||
| codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; | ||
| codecContext->sw_pix_fmt = targetPixelFormat; | ||
| // Always set pixel format to support CUDA encoding. | ||
| codecContext->pix_fmt = AV_PIX_FMT_CUDA; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -775,23 +775,30 @@ void VideoEncoder::initializeEncoder( | |
| outHeight_ = inHeight_; | ||
|
|
||
| if (videoStreamOptions.pixelFormat.has_value()) { | ||
| if (frames_.device().is_cuda()) { | ||
| if (frames_.device().is_cuda() && | ||
| !(outPixelFormat_ == AV_PIX_FMT_NV12 || | ||
| outPixelFormat_ != AV_PIX_FMT_YUV420P)) { | ||
| TORCH_CHECK( | ||
| false, | ||
| "GPU Video encoding currently only supports the NV12 pixel format. " | ||
| "Do not set pixel_format to use NV12."); | ||
| "GPU encoding only supports NV12 and YUV420P formats, got ", | ||
| av_get_pix_fmt_name(outPixelFormat_)); | ||
| } | ||
| outPixelFormat_ = | ||
| validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value()); | ||
| } else { | ||
| const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); | ||
| // Use first listed pixel format as default (often yuv420p). | ||
| // This is similar to FFmpeg's logic: | ||
| // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 | ||
| // If pixel formats are undefined for some reason, try yuv420p | ||
| outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) | ||
| ? formats[0] | ||
| : AV_PIX_FMT_YUV420P; | ||
| if (frames_.device().is_cuda()) { | ||
| // Default to YUV420P for CUDA encoding if unset. | ||
| outPixelFormat_ = AV_PIX_FMT_YUV420P; | ||
| } else { | ||
| const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); | ||
| // Use first listed pixel format as default (often yuv420p). | ||
| // This is similar to FFmpeg's logic: | ||
| // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 | ||
| // If pixel formats are undefined for some reason, try yuv420p | ||
| outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) | ||
| ? formats[0] | ||
| : AV_PIX_FMT_YUV420P; | ||
| } | ||
| } | ||
|
|
||
| // Configure codec parameters | ||
|
|
@@ -837,7 +844,7 @@ void VideoEncoder::initializeEncoder( | |
| if (frames_.device().is_cuda() && deviceInterface_) { | ||
| deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); | ||
| deviceInterface_->setupHardwareFrameContextForEncoding( | ||
| avCodecContext_.get()); | ||
| avCodecContext_.get(), outPixelFormat_); | ||
|
||
| } | ||
|
|
||
| int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions); | ||
|
|
@@ -883,7 +890,7 @@ void VideoEncoder::encode() { | |
| UniqueAVFrame avFrame; | ||
| if (frames_.device().is_cuda() && deviceInterface_) { | ||
| auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding( | ||
| currFrame, i, avCodecContext_.get()); | ||
| currFrame, i, avCodecContext_.get(), outPixelFormat_); | ||
| TORCH_CHECK( | ||
| cudaFrame != nullptr, | ||
| "convertCUDATensorToAVFrameForEncoding failed for frame ", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path): | |
| if device == "cuda": | ||
| with pytest.raises( | ||
| RuntimeError, | ||
| match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12", | ||
| match="GPU encoding currently only supports NV12 and YUV420P pixel formats, got yuv444p", | ||
| ): | ||
| getattr(encoder, method)(**valid_params, pixel_format="yuv420p") | ||
| getattr(encoder, method)(**valid_params, pixel_format="yuv444p") | ||
| return | ||
|
|
||
| with pytest.raises( | ||
|
|
@@ -1353,8 +1353,14 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range | |
| ], | ||
| ) | ||
| @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) | ||
| # TODO-VideoEncoder: Enable additional pixel formats ("yuv420p", "yuv444p") | ||
| def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method): | ||
| @pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p", None)) | ||
| # BT.601, BT.709, BT.2020 | ||
| @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc")) | ||
| # Full/PC range, Limited/TV range | ||
| @pytest.mark.parametrize("color_range", ("pc", "tv")) | ||
| def test_nvenc_against_ffmpeg_cli( | ||
| self, tmp_path, format_codec, method, pixel_format, color_space, color_range | ||
| ): | ||
| # Encode with FFmpeg CLI using nvenc codecs | ||
| format, codec = format_codec | ||
| device = "cuda" | ||
|
|
@@ -1385,7 +1391,14 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method): | |
| codec, # Use specified NVENC hardware encoder | ||
| ] | ||
|
|
||
| ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12 | ||
| if color_space: | ||
| ffmpeg_cmd.extend(["-colorspace", color_space]) | ||
| if color_range: | ||
| ffmpeg_cmd.extend(["-color_range", color_range]) | ||
| if pixel_format: | ||
| ffmpeg_cmd.extend(["-pix_fmt", pixel_format]) | ||
| else: # VideoEncoder will default to yuv420p for nvenc codecs | ||
| ffmpeg_cmd.extend(["-pix_fmt", "yuv420p"]) | ||
| if codec == "av1_nvenc": | ||
| ffmpeg_cmd.extend(["-rc", "constqp"]) # Set rate control mode for AV1 | ||
| ffmpeg_cmd.extend(["-qp", str(qp)]) # Use lossless qp for other codecs | ||
|
|
@@ -1396,18 +1409,24 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method): | |
| encoder_extra_options = {"qp": qp} | ||
| if codec == "av1_nvenc": | ||
| encoder_extra_options["rc"] = 0 # constqp mode | ||
| if color_space: | ||
| encoder_extra_options["colorspace"] = color_space | ||
| if color_range: | ||
| encoder_extra_options["color_range"] = color_range | ||
| if method == "to_file": | ||
| encoder_output_path = str(tmp_path / f"nvenc_output.{format}") | ||
| encoder.to_file( | ||
| dest=encoder_output_path, | ||
| codec=codec, | ||
| pixel_format=pixel_format, | ||
| extra_options=encoder_extra_options, | ||
| ) | ||
| encoder_output = encoder_output_path | ||
| elif method == "to_tensor": | ||
| encoder_output = encoder.to_tensor( | ||
| format=format, | ||
| codec=codec, | ||
| pixel_format=pixel_format, | ||
| extra_options=encoder_extra_options, | ||
| ) | ||
| elif method == "to_file_like": | ||
|
|
@@ -1416,6 +1435,7 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method): | |
| file_like=file_like, | ||
| format=format, | ||
| codec=codec, | ||
| pixel_format=pixel_format, | ||
| extra_options=encoder_extra_options, | ||
| ) | ||
| encoder_output = file_like.getvalue() | ||
|
|
@@ -1426,13 +1446,19 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method): | |
| encoder_frames = self.decode(encoder_output).data | ||
|
|
||
| assert ffmpeg_frames.shape[0] == encoder_frames.shape[0] | ||
| # The combination of full range + bt709 results in worse accuracy | ||
| percentage = 91 if color_range == "full" and color_space == "bt709" else 96 | ||
| for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames): | ||
| assert psnr(ff_frame, enc_frame) > 25 | ||
| assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2) | ||
| assert_tensor_close_on_at_least( | ||
| ff_frame, enc_frame, percentage=percentage, atol=2 | ||
| ) | ||
|
||
|
|
||
| if method == "to_file": | ||
| ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"]) | ||
| encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"]) | ||
| # pix_fmt nv12 is stored as yuv420p in metadata | ||
| assert encoder_metadata["pix_fmt"] == "yuv420p" | ||
| assert ffmpeg_metadata["pix_fmt"] == "yuv420p" | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| metadata_fields = ["color_range", "color_space"] | ||
| ffmpeg_metadata = self._get_video_metadata( | ||
| ffmpeg_encoded_path, metadata_fields | ||
| ) | ||
| encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields) | ||
| assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"] | ||
| assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"] | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Uh oh!
There was an error while loading. Please reload this page.