-
Notifications
You must be signed in to change notification settings - Fork 89
Add color handling to VideoEncoder GPU #1125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
cd5f8aa
939240b
b538d13
0480f1f
351a55d
ffcf872
a671f31
a9ad8e0
261549d
f4d777c
da3a6d7
1dc7690
ce86d61
daf2fda
8c2bcee
193d7c9
e83c130
e63f118
5e8745f
d1cbaeb
ef03ad5
40f4c66
e28081f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,22 +380,76 @@ std::string CudaDeviceInterface::getDetails() { | |
| // Below are methods exclusive to video encoding: | ||
| // -------------------------------------------------------------------------- | ||
| namespace { | ||
| // RGB to NV12 color conversion matrix for BT.601 limited range. | ||
| // NPP ColorTwist function used below expects the limited range | ||
| // color conversion matrix, and this matches FFmpeg's default behavior. | ||
| const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { | ||
| // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) | ||
| {0.257f, 0.504f, 0.098f, 16.0f}, | ||
| // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) | ||
| {-0.148f, -0.291f, 0.439f, 128.0f}, | ||
| // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) | ||
| {0.439f, -0.368f, -0.071f, 128.0f}}; | ||
| // For background on these matrices, see the note: | ||
| // [YUV -> RGB Color Conversion, color space and color range] | ||
| // https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65 | ||
| // TODO-VideoEncoder: Extend note to explain limited vs full range | ||
| // RGB to YUV conversion matrices to use in NPP color conversion functions | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the note, it will be super useful if we ever need to go back to this in the future. |
||
| struct ColorConversionMatrices { | ||
| static constexpr Npp32f BT601_LIMITED[3][4] = { | ||
| {0.257f, 0.504f, 0.098f, 16.0f}, | ||
| {-0.148f, -0.291f, 0.439f, 128.0f}, | ||
| {0.439f, -0.368f, -0.071f, 128.0f}}; | ||
|
|
||
| static constexpr Npp32f BT601_FULL[3][4] = { | ||
| {0.299f, 0.587f, 0.114f, 0.0f}, | ||
| {-0.168736f, -0.331264f, 0.5f, 128.0f}, | ||
| {0.5f, -0.418688f, -0.081312f, 128.0f}}; | ||
|
|
||
| static constexpr Npp32f BT709_LIMITED[3][4] = { | ||
| {0.183f, 0.614f, 0.062f, 16.0f}, | ||
| {-0.101f, -0.338f, 0.439f, 128.0f}, | ||
| {0.439f, -0.399f, -0.040f, 128.0f}}; | ||
|
|
||
| static constexpr Npp32f BT709_FULL[3][4] = { | ||
| {0.2126f, 0.7152f, 0.0722f, 0.0f}, | ||
| {-0.114572f, -0.385428f, 0.5f, 128.0f}, | ||
| {0.5f, -0.454153f, -0.045847f, 128.0f}}; | ||
|
|
||
| static constexpr Npp32f BT2020_LIMITED[3][4] = { | ||
| {0.2256f, 0.5823f, 0.0509f, 16.0f}, | ||
| {-0.122f, -0.315f, 0.439f, 128.0f}, | ||
| {0.439f, -0.403f, -0.036f, 128.0f}}; | ||
|
|
||
| static constexpr Npp32f BT2020_FULL[3][4] = { | ||
| {0.2627f, 0.6780f, 0.0593f, 0.0f}, | ||
| {-0.139630f, -0.360370f, 0.5f, 128.0f}, | ||
| {0.5f, -0.459786f, -0.040214f, 128.0f}}; | ||
| }; | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
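| // Returns the RGB to YUV conversion matrix matching the codec context's color space and range. An unspecified color range is treated as limited range, and any color space other than BT.709 or BT.2020 (NCL) falls back to BT.601. | ||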
| const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] { | ||
| if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range | ||
| codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) { | ||
| if (codecContext->colorspace == AVCOL_SPC_BT470BG) { | ||
| return ColorConversionMatrices::BT601_LIMITED; | ||
| } else if (codecContext->colorspace == AVCOL_SPC_BT709) { | ||
| return ColorConversionMatrices::BT709_LIMITED; | ||
| } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) { | ||
| return ColorConversionMatrices::BT2020_LIMITED; | ||
| } else { // default to BT.601 | ||
| return ColorConversionMatrices::BT601_LIMITED; | ||
| } | ||
| } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range | ||
| if (codecContext->colorspace == AVCOL_SPC_BT470BG) { | ||
| return ColorConversionMatrices::BT601_FULL; | ||
| } else if (codecContext->colorspace == AVCOL_SPC_BT709) { | ||
| return ColorConversionMatrices::BT709_FULL; | ||
| } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) { | ||
| return ColorConversionMatrices::BT2020_FULL; | ||
| } else { // default to BT.601 | ||
| return ColorConversionMatrices::BT601_FULL; | ||
| } | ||
| } | ||
| return ColorConversionMatrices::BT601_LIMITED; | ||
| } | ||
| } // namespace | ||
|
|
||
| UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( | ||
| const torch::Tensor& tensor, | ||
| int frameIndex, | ||
| AVCodecContext* codecContext) { | ||
| AVCodecContext* codecContext, | ||
| AVPixelFormat targetPixelFormat) { | ||
| TORCH_CHECK( | ||
| tensor.dim() == 3 && tensor.size(0) == 3, | ||
| "Expected 3D RGB tensor (CHW format), got shape: ", | ||
|
|
@@ -434,34 +488,44 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( | |
| torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous(); | ||
|
|
||
| NppiSize oSizeROI = {width, height}; | ||
| NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| validateInt64ToInt( | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| defaultLimitedRangeRgbToNv12, | ||
| *nppCtx_); | ||
| NppStatus status; | ||
| switch (targetPixelFormat) { | ||
| case AV_PIX_FMT_NV12: | ||
| status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| getConversionMatrix(codecContext), | ||
| *nppCtx_); | ||
| break; | ||
| default: | ||
| TORCH_CHECK( | ||
| false, | ||
| "GPU encoding expected to encode into nv12 pixel format, but got ", | ||
| av_get_pix_fmt_name(targetPixelFormat), | ||
| ". This should not happen, please report this to the TorchCodec repo"); | ||
| } | ||
|
||
|
|
||
| TORCH_CHECK( | ||
| status == NPP_SUCCESS, | ||
| "Failed to convert RGB to NV12: NPP error code ", | ||
| "Failed to convert RGB to ", | ||
| av_get_pix_fmt_name(targetPixelFormat), | ||
| ": NPP error code ", | ||
| status); | ||
|
|
||
| // TODO-VideoEncoder: Enable configuration of color properties, similar to | ||
| // FFmpeg. Below are the default color properties used by FFmpeg. | ||
| avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 | ||
| avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range | ||
|
|
||
| avFrame->colorspace = codecContext->colorspace; | ||
| avFrame->color_range = codecContext->color_range; | ||
| return avFrame; | ||
| } | ||
|
|
||
| // Allocates and initializes AVHWFramesContext, and sets pixel format fields | ||
| // to enable encoding with CUDA device. The hw_frames_ctx field is needed by | ||
| // FFmpeg to allocate frames on GPU's memory. | ||
| void CudaDeviceInterface::setupHardwareFrameContextForEncoding( | ||
| AVCodecContext* codecContext) { | ||
| AVCodecContext* codecContext, | ||
| AVPixelFormat targetPixelFormat) { | ||
| TORCH_CHECK(codecContext != nullptr, "codecContext is null"); | ||
| TORCH_CHECK( | ||
| hardwareDeviceCtx_, "Hardware device context has not been initialized"); | ||
|
|
@@ -471,9 +535,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding( | |
| hwFramesCtxRef != nullptr, | ||
| "Failed to allocate hardware frames context for codec"); | ||
|
|
||
| // TODO-VideoEncoder: Enable user set pixel formats to be set | ||
| // (outPixelFormat_) and handled with the appropriate NPP function | ||
| codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; | ||
| codecContext->sw_pix_fmt = targetPixelFormat; | ||
| // Always set pixel format to support CUDA encoding. | ||
| codecContext->pix_fmt = AV_PIX_FMT_CUDA; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -790,23 +790,30 @@ void VideoEncoder::initializeEncoder( | |
| outHeight_ = inHeight_; | ||
|
|
||
| if (videoStreamOptions.pixelFormat.has_value()) { | ||
| // TODO-VideoEncoder: Enable pixel formats to be set by user | ||
| // and handled with the appropriate NPP function on GPU. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved this TODO from The behavior is unchanged: If
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are we raising an error for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Essentially yes. Because we do not understand the codec's behavior yet, we do not want the user to set or expect a pixel format. |
||
| if (frames_.device().is_cuda()) { | ||
| TORCH_CHECK( | ||
| false, | ||
| "GPU Video encoding currently only supports the NV12 pixel format. " | ||
| "Do not set pixel_format to use NV12."); | ||
| "Video encoding on GPU currently only supports the nv12 pixel format. " | ||
| "Do not set pixel_format to use nv12 by default."); | ||
| } | ||
| outPixelFormat_ = | ||
| validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value()); | ||
| } else { | ||
| const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); | ||
| // Use first listed pixel format as default (often yuv420p). | ||
| // This is similar to FFmpeg's logic: | ||
| // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 | ||
| // If pixel formats are undefined for some reason, try yuv420p | ||
| outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) | ||
| ? formats[0] | ||
| : AV_PIX_FMT_YUV420P; | ||
| if (frames_.device().is_cuda()) { | ||
| // Default to nv12 pixel format when encoding on GPU. | ||
| outPixelFormat_ = AV_PIX_FMT_NV12; | ||
|
||
| } else { | ||
| const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); | ||
| // Use first listed pixel format as default (often yuv420p). | ||
| // This is similar to FFmpeg's logic: | ||
| // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 | ||
| // If pixel formats are undefined for some reason, try yuv420p | ||
| outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) | ||
| ? formats[0] | ||
| : AV_PIX_FMT_YUV420P; | ||
| } | ||
| } | ||
|
|
||
| // Configure codec parameters | ||
|
|
@@ -852,7 +859,7 @@ void VideoEncoder::initializeEncoder( | |
| if (frames_.device().is_cuda() && deviceInterface_) { | ||
| deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); | ||
| deviceInterface_->setupHardwareFrameContextForEncoding( | ||
| avCodecContext_.get()); | ||
| avCodecContext_.get(), outPixelFormat_); | ||
|
||
| } | ||
|
|
||
| int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions); | ||
|
|
@@ -898,7 +905,7 @@ void VideoEncoder::encode() { | |
| UniqueAVFrame avFrame; | ||
| if (frames_.device().is_cuda() && deviceInterface_) { | ||
| auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding( | ||
| currFrame, i, avCodecContext_.get()); | ||
| currFrame, i, avCodecContext_.get(), outPixelFormat_); | ||
| TORCH_CHECK( | ||
| cudaFrame != nullptr, | ||
| "convertCUDATensorToAVFrameForEncoding failed for frame ", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path): | |
| if device == "cuda": | ||
| with pytest.raises( | ||
| RuntimeError, | ||
| match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12", | ||
| match="Video encoding on GPU currently only supports the nv12 pixel format. Do not set pixel_format to use nv12 by default.", | ||
| ): | ||
| getattr(encoder, method)(**valid_params, pixel_format="yuv420p") | ||
| getattr(encoder, method)(**valid_params, pixel_format="yuv444p") | ||
| return | ||
|
|
||
| with pytest.raises( | ||
|
|
@@ -1354,7 +1354,24 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range | |
| ), | ||
| ], | ||
| ) | ||
| def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): | ||
| # BT.601, BT.709, BT.2020 | ||
| @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc", None)) | ||
| # Full/PC range, Limited/TV range | ||
| @pytest.mark.parametrize("color_range", ("pc", "tv", None)) | ||
| def test_nvenc_against_ffmpeg_cli( | ||
| self, tmp_path, method, format, codec, color_space, color_range | ||
| ): | ||
| ffmpeg_version = get_ffmpeg_major_version() | ||
| # TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range. | ||
| # See https://github.com/meta-pytorch/torchcodec/issues/1140 | ||
| if ffmpeg_version in (4, 6) and not ( | ||
| color_space == "bt470bg" and color_range == "tv" | ||
| ): | ||
| pytest.skip( | ||
| "Non-default color space and range have lower accuracy on FFmpeg 4 and 6" | ||
| ) | ||
| if ffmpeg_version == 4 and codec == "av1_nvenc": | ||
| pytest.skip("av1_nvenc is not supported on FFmpeg 4") | ||
|
||
| # Encode with FFmpeg CLI using nvenc codecs | ||
| device = "cuda" | ||
| qp = 1 # Use near lossless encoding to reduce noise and support av1_nvenc | ||
|
|
@@ -1382,16 +1399,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): | |
| temp_raw_path, | ||
| ] | ||
| # CLI requires explicit codec for nvenc | ||
| # VideoEncoder will default to h264_nvenc since the frames are on GPU. | ||
| ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"]) | ||
| # VideoEncoder will select an NVENC encoder by default since the frames are on GPU. | ||
|
|
||
| ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12 | ||
| ffmpeg_cmd.extend(["-qp", str(qp)]) | ||
| ffmpeg_cmd.extend(["-qp", str(qp)]) # Use the same near-lossless qp for every codec | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if color_space: | ||
| ffmpeg_cmd.extend(["-colorspace", color_space]) | ||
| if color_range: | ||
| ffmpeg_cmd.extend(["-color_range", color_range]) | ||
| ffmpeg_cmd.extend([ffmpeg_encoded_path]) | ||
| subprocess.run(ffmpeg_cmd, check=True, capture_output=True) | ||
|
|
||
| encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate) | ||
| encoder_extra_options = {"qp": qp} | ||
| if color_space: | ||
| encoder_extra_options["colorspace"] = color_space | ||
| if color_range: | ||
| encoder_extra_options["color_range"] = color_range | ||
| if method == "to_file": | ||
| encoder_output_path = str(tmp_path / f"nvenc_output.{format}") | ||
| encoder.to_file( | ||
|
|
@@ -1422,13 +1446,37 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): | |
| encoder_frames = self.decode(encoder_output).data | ||
|
|
||
| assert ffmpeg_frames.shape[0] == encoder_frames.shape[0] | ||
| # The combination of full range + bt709 results in worse accuracy | ||
| percentage = 91 if color_range == "full" and color_space == "bt709" else 96 | ||
| for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames): | ||
| assert psnr(ff_frame, enc_frame) > 25 | ||
| assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2) | ||
| assert_tensor_close_on_at_least( | ||
| ff_frame, enc_frame, percentage=percentage, atol=2 | ||
| ) | ||
|
||
|
|
||
| if method == "to_file": | ||
| ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"]) | ||
| encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"]) | ||
| # pix_fmt nv12 is stored as yuv420p in metadata | ||
| assert encoder_metadata["pix_fmt"] == "yuv420p" | ||
| assert ffmpeg_metadata["pix_fmt"] == "yuv420p" | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| metadata_fields = ["pix_fmt", "color_range", "color_space"] | ||
| ffmpeg_metadata = self._get_video_metadata( | ||
| ffmpeg_encoded_path, metadata_fields | ||
| ) | ||
| encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields) | ||
| # pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc) is used. | ||
| # In that case, h264 and hevc NVENC codecs will use yuvj420p automatically. | ||
| if color_range == "pc" and codec != "av1_nvenc": | ||
| expected_pix_fmt = "yuvj420p" | ||
| else: | ||
| # av1_nvenc does not utilize the yuvj420p pixel format | ||
| expected_pix_fmt = "yuv420p" | ||
| assert ( | ||
| encoder_metadata["pix_fmt"] | ||
| == ffmpeg_metadata["pix_fmt"] | ||
| == expected_pix_fmt | ||
| ) | ||
| assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"] | ||
| assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"] | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Default values vary by codec, so we only assert when | ||
| # color_range and color_space are not None. | ||
| if color_range is not None: | ||
| color_range = ffmpeg_metadata["color_range"] | ||
|
||
| if color_space is not None: | ||
| assert color_space == ffmpeg_metadata["color_space"] | ||
Uh oh!
There was an error while loading. Please reload this page.