diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 090442f3d..aabe1d75c 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -383,16 +383,120 @@ std::string CudaDeviceInterface::getDetails() { // Below are methods exclusive to video encoding: // -------------------------------------------------------------------------- namespace { -// RGB to NV12 color conversion matrix for BT.601 limited range. -// NPP ColorTwist function used below expects the limited range -// color conversion matrix, and this matches FFmpeg's default behavior. -const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { - // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) - {0.257f, 0.504f, 0.098f, 16.0f}, - // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) - {-0.148f, -0.291f, 0.439f, 128.0f}, - // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) - {0.439f, -0.368f, -0.071f, 128.0f}}; +// Note: [RGB -> YUV Color Conversion, limited color range] +// +// For context on this subject, first read the note: +// [YUV -> RGB Color Conversion, color space and color range] +// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65 +// +// Let's encode RGB -> YUV in the limited color range for BT.601 color space. +// In limited range, the [0, 255] range is mapped into [16-235] for Y, and into +// [16-240] for U,V. 
+// To implement, we get the full range conversion matrix as before, then scale: +// - Y channel: scale by (235-16)/255 = 219/255 +// - U,V channels: scale by (240-16)/255 = 224/255 +// https://en.wikipedia.org/wiki/YCbCr#Y%E2%80%B2PbPr_to_Y%E2%80%B2CbCr +// +// ```py +// import torch +// kr, kg, kb = 0.299, 0.587, 0.114 # BT.601 luma coefficients +// u_scale = 2 * (1 - kb) +// v_scale = 2 * (1 - kr) +// +// rgb_to_yuv_full = torch.tensor([ +// [kr, kg, kb], +// [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale], +// [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale] +// ]) +// +// full_to_limited_y_scale = 219.0 / 255.0 +// full_to_limited_uv_scale = 224.0 / 255.0 +// +// rgb_to_yuv_limited = rgb_to_yuv_full * torch.tensor([ +// [full_to_limited_y_scale], +// [full_to_limited_uv_scale], +// [full_to_limited_uv_scale] +// ]) +// +// print("RGB->YUV matrix (Limited Range BT.601):") +// print(rgb_to_yuv_limited) +// ``` +// +// This yields: +// tensor([[ 0.2568, 0.5041, 0.0979], +// [-0.1482, -0.2910, 0.4392], +// [ 0.4392, -0.3678, -0.0714]]) +// +// Which matches https://fourcc.org/fccyvrgb.php +// +// To perform color conversion in NPP, we are required to provide these color +// conversion matrices to ColorTwist functions, for example, +// `nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx`. +// https://docs.nvidia.com/cuda/npp/image_color_conversion.html +// +// These offsets are added in the 4th column of each conversion matrix below. +// - In limited range, Y is offset by 16 to add the lower margin. +// - In both color ranges, U,V are offset by 128 to be centered around 128. 
+// +// RGB to YUV conversion matrices to use in NPP color conversion functions +struct ColorConversionMatrices { + static constexpr Npp32f BT601_LIMITED[3][4] = { + {0.2568f, 0.5041f, 0.0979f, 16.0f}, + {-0.1482f, -0.2910f, 0.4392f, 128.0f}, + {0.4392f, -0.3678f, -0.0714f, 128.0f}}; + + static constexpr Npp32f BT601_FULL[3][4] = { + {0.2990f, 0.5870f, 0.1140f, 0.0f}, + {-0.1687f, -0.3313f, 0.5000f, 128.0f}, + {0.5000f, -0.4187f, -0.0813f, 128.0f}}; + + static constexpr Npp32f BT709_LIMITED[3][4] = { + {0.1826f, 0.6142f, 0.0620f, 16.0f}, + {-0.1006f, -0.3386f, 0.4392f, 128.0f}, + {0.4392f, -0.3989f, -0.0403f, 128.0f}}; + + static constexpr Npp32f BT709_FULL[3][4] = { + {0.2126f, 0.7152f, 0.0722f, 0.0f}, + {-0.1146f, -0.3854f, 0.5000f, 128.0f}, + {0.5000f, -0.4542f, -0.0458f, 128.0f}}; + + static constexpr Npp32f BT2020_LIMITED[3][4] = { + {0.2256f, 0.5823f, 0.0509f, 16.0f}, + {-0.1227f, -0.3166f, 0.4392f, 128.0f}, + {0.4392f, -0.4039f, -0.0353f, 128.0f}}; + + static constexpr Npp32f BT2020_FULL[3][4] = { + {0.2627f, 0.6780f, 0.0593f, 0.0f}, + {-0.139630f, -0.360370f, 0.5000f, 128.0f}, + {0.5000f, -0.459786f, -0.040214f, 128.0f}}; +}; + +// Returns conversion matrix based on codec context color space and range +const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] { + if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range + codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) { + if (codecContext->colorspace == AVCOL_SPC_BT470BG) { + return ColorConversionMatrices::BT601_LIMITED; + } else if (codecContext->colorspace == AVCOL_SPC_BT709) { + return ColorConversionMatrices::BT709_LIMITED; + } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) { + return ColorConversionMatrices::BT2020_LIMITED; + } else { // default to BT.601 + return ColorConversionMatrices::BT601_LIMITED; + } + } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range + if (codecContext->colorspace == AVCOL_SPC_BT470BG) { + return 
ColorConversionMatrices::BT601_FULL; + } else if (codecContext->colorspace == AVCOL_SPC_BT709) { + return ColorConversionMatrices::BT709_FULL; + } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) { + return ColorConversionMatrices::BT2020_FULL; + } else { // default to BT.601 + return ColorConversionMatrices::BT601_FULL; + } + } + return ColorConversionMatrices::BT601_LIMITED; +} } // namespace UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( @@ -437,26 +541,26 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous(); NppiSize oSizeROI = {width, height}; - NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( + NppStatus status; + // Convert to NV12, as CUDA_ENCODING_PIXEL_FORMAT is always NV12 currently + status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( static_cast(hwcFrame.data_ptr()), - validateInt64ToInt( - hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"), + hwcFrame.stride(0) * hwcFrame.element_size(), avFrame->data, avFrame->linesize, oSizeROI, - defaultLimitedRangeRgbToNv12, + getConversionMatrix(codecContext), *nppCtx_); TORCH_CHECK( status == NPP_SUCCESS, - "Failed to convert RGB to NV12: NPP error code ", + "Failed to convert RGB to ", + av_get_pix_fmt_name(DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT), + ": NPP error code ", status); - // TODO-VideoEncoder: Enable configuration of color properties, similar to - // FFmpeg. Below are the default color properties used by FFmpeg. 
- avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 - avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range - + avFrame->colorspace = codecContext->colorspace; + avFrame->color_range = codecContext->color_range; return avFrame; } @@ -474,9 +578,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding( hwFramesCtxRef != nullptr, "Failed to allocate hardware frames context for codec"); - // TODO-VideoEncoder: Enable user set pixel formats to be set - // (outPixelFormat_) and handled with the appropriate NPP function - codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; + codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; // Always set pixel format to support CUDA encoding. codecContext->pix_fmt = AV_PIX_FMT_CUDA; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index 551ffa298..6be7b68a2 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -139,6 +139,9 @@ class DeviceInterface { return ""; } + // Pixel format used for encoding on CUDA devices + static constexpr AVPixelFormat CUDA_ENCODING_PIXEL_FORMAT = AV_PIX_FMT_NV12; + // Function used for video encoding, only implemented in CudaDeviceInterface. // It is here to isolate CUDA dependencies from CPU builds // TODO Video-Encoder: Reconsider using video encoding functions in device diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index d210b430c..73ea98b92 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -782,23 +782,30 @@ void VideoEncoder::initializeEncoder( outHeight_ = inHeight_; if (videoStreamOptions.pixelFormat.has_value()) { + // TODO-VideoEncoder: Enable pixel formats to be set by user + // and handled with the appropriate NPP function on GPU. if (frames_.device().is_cuda()) { TORCH_CHECK( false, - "GPU Video encoding currently only supports the NV12 pixel format. 
" - "Do not set pixel_format to use NV12."); + "Video encoding on GPU currently only supports the nv12 pixel format. " + "Do not set pixel_format to use nv12 by default."); } outPixelFormat_ = validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value()); } else { - const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); - // Use first listed pixel format as default (often yuv420p). - // This is similar to FFmpeg's logic: - // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 - // If pixel formats are undefined for some reason, try yuv420p - outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) - ? formats[0] - : AV_PIX_FMT_YUV420P; + if (frames_.device().is_cuda()) { + // Default to nv12 pixel format when encoding on GPU. + outPixelFormat_ = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; + } else { + const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec); + // Use first listed pixel format as default (often yuv420p). + // This is similar to FFmpeg's logic: + // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087 + // If pixel formats are undefined for some reason, try yuv420p + outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE) + ? formats[0] + : AV_PIX_FMT_YUV420P; + } } // Configure codec parameters diff --git a/test/test_encoders.py b/test/test_encoders.py index 63dcc5add..078206f95 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -777,9 +777,9 @@ def test_pixel_format_errors(self, method, device, tmp_path): if device == "cuda": with pytest.raises( RuntimeError, - match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12", + match="Video encoding on GPU currently only supports the nv12 pixel format. 
Do not set pixel_format to use nv12 by default.", ): - getattr(encoder, method)(**valid_params, pixel_format="yuv420p") + getattr(encoder, method)(**valid_params, pixel_format="yuv444p") return with pytest.raises( @@ -1345,13 +1345,37 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range pytest.param( "mkv", "av1_nvenc", - marks=pytest.mark.skipif( - IN_GITHUB_CI, reason="av1_nvenc is not supported on CI" - ), + marks=[ + pytest.mark.skipif( + IN_GITHUB_CI, reason="av1_nvenc is not supported on CI" + ), + pytest.mark.skipif( + get_ffmpeg_major_version() == 4, + reason="av1_nvenc is not supported on FFmpeg 4", + ), + ], ), ], ) - def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): + # We test the color space and color range parameters in this test, because + # we are required to define matrices specific to these specs when using NPP, see note: + # [RGB -> YUV Color Conversion, limited color range] + # BT.601, BT.709, BT.2020 + @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc", None)) + # Full/PC range, Limited/TV range + @pytest.mark.parametrize("color_range", ("pc", "tv", None)) + def test_nvenc_against_ffmpeg_cli( + self, tmp_path, method, format, codec, color_space, color_range + ): + ffmpeg_version = get_ffmpeg_major_version() + # TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range. 
+ # See https://github.com/meta-pytorch/torchcodec/issues/1140 + if ffmpeg_version in (4, 6) and not ( + color_space == "bt470bg" and color_range == "tv" + ): + pytest.skip( + "Non-default color space and range have lower accuracy on FFmpeg 4 and 6" + ) # Encode with FFmpeg CLI using nvenc codecs device = "cuda" qp = 1 # Use near lossless encoding to reduce noise and support av1_nvenc @@ -1379,16 +1403,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): temp_raw_path, ] # CLI requires explicit codec for nvenc + # VideoEncoder will default to h264_nvenc since the frames are on GPU. ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"]) - # VideoEncoder will select an NVENC encoder by default since the frames are on GPU. - ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12 ffmpeg_cmd.extend(["-qp", str(qp)]) + if color_space: + ffmpeg_cmd.extend(["-colorspace", color_space]) + if color_range: + ffmpeg_cmd.extend(["-color_range", color_range]) ffmpeg_cmd.extend([ffmpeg_encoded_path]) subprocess.run(ffmpeg_cmd, check=True, capture_output=True) encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate) encoder_extra_options = {"qp": qp} + if color_space: + encoder_extra_options["colorspace"] = color_space + if color_range: + encoder_extra_options["color_range"] = color_range if method == "to_file": encoder_output_path = str(tmp_path / f"nvenc_output.{format}") encoder.to_file( @@ -1424,8 +1455,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2) if method == "to_file": - ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"]) - encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"]) - # pix_fmt nv12 is stored as yuv420p in metadata - assert encoder_metadata["pix_fmt"] == "yuv420p" - assert ffmpeg_metadata["pix_fmt"] == "yuv420p" + metadata_fields = 
["pix_fmt", "color_range", "color_space"] + ffmpeg_metadata = self._get_video_metadata( + ffmpeg_encoded_path, metadata_fields + ) + encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields) + # pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc) is used + # In that case, h264 and hevc NVENC codecs will use yuvj420p automatically. + if color_range == "pc" and codec != "av1_nvenc": + expected_pix_fmt = "yuvj420p" + else: + # av1_nvenc does not utilize the yuvj420p pixel format + expected_pix_fmt = "yuv420p" + assert ( + encoder_metadata["pix_fmt"] + == ffmpeg_metadata["pix_fmt"] + == expected_pix_fmt + ) + + assert encoder_metadata["color_range"] == ffmpeg_metadata["color_range"] + assert encoder_metadata["color_space"] == ffmpeg_metadata["color_space"] + # Default values vary by codec, so we only assert when + # color_range and color_space are not None. + if color_range is not None: + # FFmpeg and torchcodec encode color_range as 'unknown' for mov and avi + # when color_range='tv' and color_space=None on FFmpeg 7/8. + # Since this failure is rare, I suspect it's a bug related to these + # older container formats on newer FFmpeg versions. + if not ( + ffmpeg_version in (7, 8) + and color_range == "tv" + and color_space is None + and format in ("mov", "avi") + ): + assert color_range == encoder_metadata["color_range"] + if color_space is not None: + assert color_space == encoder_metadata["color_space"]