Add pixel_format to VideoEncoder API (#1027)

Dan-Flores · web-flow · commit 45647a1012ae · 2025-11-10T10:11:20.000-05:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -4,6 +4,10 @@
 #include "Encoder.h"
 #include "torch/types.h"
 
+extern "C" {
+#include <libavutil/pixdesc.h>
+}
+
 namespace facebook::torchcodec {
 
 namespace {
@@ -534,6 +538,36 @@ torch::Tensor validateFrames(const torch::Tensor& frames) {
   return frames.contiguous();
 }
 
+// Resolves targetPixelFormat to an AVPixelFormat the encoder supports;
+// otherwise raises via TORCH_CHECK, listing the supported format names.
+AVPixelFormat validatePixelFormat(
+    const AVCodec& avCodec,
+    const std::string& targetPixelFormat) {
+  // AV_PIX_FMT_NONE here means av_get_pix_fmt did not recognize the name.
+  AVPixelFormat pixelFormat = av_get_pix_fmt(targetPixelFormat.c_str());
+  // The list may be null; then nothing matches and no names are reported.
+  const AVPixelFormat* supportedFormats = getSupportedPixelFormats(avCodec);
+  std::stringstream supportedNames; // only consumed by the error path
+  for (int i = 0; supportedFormats && supportedFormats[i] != AV_PIX_FMT_NONE;
+       ++i) {
+    if (supportedFormats[i] == pixelFormat) {
+      return pixelFormat;
+    }
+    // av_get_pix_fmt_name may return null; never stream a null char*.
+    const char* name = av_get_pix_fmt_name(supportedFormats[i]);
+    supportedNames << " " << (name != nullptr ? name : "unknown");
+  }
+  std::stringstream errorMsg;
+  if (pixelFormat == AV_PIX_FMT_NONE) {
+    errorMsg << "Unknown pixel format: " << targetPixelFormat;
+  } else {
+    errorMsg << "Specified pixel format " << targetPixelFormat
+             << " is not supported by the " << avCodec.name << " encoder.";
+  }
+  errorMsg << "\nSupported pixel formats for " << avCodec.name << ":"
+           << supportedNames.str();
+  TORCH_CHECK(false, errorMsg.str());
+}
 } // namespace
 
 VideoEncoder::~VideoEncoder() {
@@ -635,15 +669,19 @@ void VideoEncoder::initializeEncoder(
   outWidth_ = inWidth_;
   outHeight_ = inHeight_;
 
-  // TODO-VideoEncoder: Enable other pixel formats
-  // Let FFmpeg choose best pixel format to minimize loss
-  outPixelFormat_ = avcodec_find_best_pix_fmt_of_list(
-      getSupportedPixelFormats(*avCodec), // List of supported formats
-      AV_PIX_FMT_GBRP, // We reorder input to GBRP currently
-      0, // No alpha channel
-      nullptr // Discard conversion loss information
-  );
-  TORCH_CHECK(outPixelFormat_ != -1, "Failed to find best pix fmt")
+  if (videoStreamOptions.pixelFormat.has_value()) {
+    outPixelFormat_ =
+        validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
+  } else {
+    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+    // Use first listed pixel format as default (often yuv420p).
+    // This is similar to FFmpeg's logic:
+    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+    // If pixel formats are undefined for some reason, try yuv420p
+    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+        ? formats[0]
+        : AV_PIX_FMT_YUV420P;
+  }
 
   // Configure codec parameters
   avCodecContext_->codec_id = avCodec->id;
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -48,6 +48,10 @@ struct VideoStreamOptions {
   // TODO-VideoEncoder: Consider adding other optional fields here
   // (bit rate, gop size, max b frames, preset)
   std::optional<int> crf;
+
+  // Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
+  // If not specified, uses codec's default format.
+  std::optional<std::string> pixelFormat;
 };
 
 struct AudioStreamOptions {
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, int? crf=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, int? crf=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, int? crf=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, int? crf=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, int? crf=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -603,8 +603,10 @@ void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view file_name,
+    std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   VideoEncoder(
       frames,
@@ -618,9 +620,11 @@ at::Tensor encode_video_to_tensor(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view format,
+    std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   return VideoEncoder(
              frames,
@@ -636,6 +640,7 @@ void _encode_video_to_file_like(
     int64_t frame_rate,
     std::string_view format,
     int64_t file_like_context,
+    std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
@@ -644,6 +649,7 @@ void _encode_video_to_file_like(
   std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
 
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
 
   VideoEncoder encoder(
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -214,6 +214,7 @@ def encode_video_to_file_like(
     format: str,
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
     crf: Optional[int] = None,
+    pixel_format: Optional[str] = None,
 ) -> None:
     """Encode video frames to a file-like object.
 
@@ -223,6 +224,7 @@ def encode_video_to_file_like(
         format: Video format (e.g., "mp4", "mov", "mkv")
         file_like: File-like object that supports write() and seek() methods
         crf: Optional constant rate factor for encoding quality
+        pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
     """
     assert _pybind_ops is not None
 
@@ -231,6 +233,7 @@ def encode_video_to_file_like(
         frame_rate,
         format,
         _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
+        pixel_format,
         crf,
     )
 
@@ -319,7 +322,8 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
-    crf: Optional[int],
+    crf: Optional[int] = None,
+    pixel_format: Optional[str] = None,
 ) -> None:
     return
 
@@ -329,7 +333,8 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
-    crf: Optional[int],
+    crf: Optional[int] = None,
+    pixel_format: Optional[str] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
@@ -341,6 +346,7 @@ def _encode_video_to_file_like_abstract(
     format: str,
     file_like_context: int,
     crf: Optional[int] = None,
+    pixel_format: Optional[str] = None,
 ) -> None:
     return
 
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Union
+from typing import Optional, Union
 
 import torch
 from torch import Tensor
@@ -35,29 +35,38 @@ def __init__(self, frames: Tensor, *, frame_rate: int):
     def to_file(
         self,
         dest: Union[str, Path],
+        *,
+        pixel_format: Optional[str] = None,
     ) -> None:
         """Encode frames into a file.
 
         Args:
             dest (str or ``pathlib.Path``): The path to the output file, e.g.
                 ``video.mp4``. The extension of the file determines the video
                 container format.
+            pixel_format (str, optional): The pixel format for encoding (e.g.,
+                "yuv420p", "yuv444p"). If not specified, uses codec's default format.
         """
         _core.encode_video_to_file(
             frames=self._frames,
             frame_rate=self._frame_rate,
             filename=str(dest),
+            pixel_format=pixel_format,
         )
 
     def to_tensor(
         self,
         format: str,
+        *,
+        pixel_format: Optional[str] = None,
     ) -> Tensor:
         """Encode frames into raw bytes, as a 1D uint8 Tensor.
 
         Args:
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
             "mkv", "avi", "webm", "flv", or "gif"
+            pixel_format (str, optional): The pixel format to encode frames into (e.g.,
+                "yuv420p", "yuv444p"). If not specified, uses codec's default format.
 
         Returns:
             Tensor: The raw encoded bytes as 4D uint8 Tensor.
@@ -66,12 +75,15 @@ def to_tensor(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
+            pixel_format=pixel_format,
         )
 
     def to_file_like(
         self,
         file_like,
         format: str,
+        *,
+        pixel_format: Optional[str] = None,
     ) -> None:
         """Encode frames into a file-like object.
 
@@ -83,10 +95,13 @@ def to_file_like(
                 int = 0) -> int``.
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
                 "mkv", "avi", "webm", "flv", or "gif".
+            pixel_format (str, optional): The pixel format for encoding (e.g.,
+                "yuv420p", "yuv444p"). If not specified, uses codec's default format.
         """
         _core.encode_video_to_file_like(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
             file_like=file_like,
+            pixel_format=pixel_format,
         )
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -629,6 +629,30 @@ def test_bad_input(self, tmp_path):
         ):
             encoder.to_tensor(format="bad_format")
 
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
+    def test_pixel_format_errors(self, method, tmp_path):
+        """Unknown or codec-unsupported pixel formats must raise RuntimeError."""
+        # Arguments that are valid on their own for each output method, so
+        # pixel_format is the only bad input in the calls below.
+        kwargs_by_method = {
+            "to_file": dict(dest=str(tmp_path / "output.mp4")),
+            "to_tensor": dict(format="mp4"),
+            "to_file_like": dict(file_like=io.BytesIO(), format="mp4"),
+        }
+        call_kwargs = kwargs_by_method[method]
+
+        video = torch.zeros((5, 3, 64, 64), dtype=torch.uint8)
+        encode = getattr(VideoEncoder(video, frame_rate=30), method)
+
+        # Requested pixel_format -> regex the raised error message must match.
+        failing_cases = {
+            "invalid_pix_fmt": r"Unknown pixel format: invalid_pix_fmt[\s\S]*Supported pixel formats.*yuv420p",
+            "rgb24": r"Specified pixel format rgb24 is not supported[\s\S]*Supported pixel formats.*yuv420p",
+        }
+        for bad_format, message_regex in failing_cases.items():
+            with pytest.raises(RuntimeError, match=message_regex):
+                encode(**call_kwargs, pixel_format=bad_format)
+
     @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
     def test_contiguity(self, method, tmp_path):
         # Ensure that 2 sets of video frames with the same pixel values are encoded
diff --git a/test/test_ops.py b/test/test_ops.py