Merge branch 'main' of github.com:pytorch/torchcodec into refactor_metadata_order

scotts · scotts · commit 78328348a4cf · 2025-11-18T10:51:59.000-08:00
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
+[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://meta-pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://meta-pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
 
 # TorchCodec
 
@@ -23,7 +23,7 @@ We achieve these capabilities through:
 
 Here's a condensed summary of what you can do with TorchCodec. For more detailed
 examples, [check out our
-documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!
+documentation](https://meta-pytorch.org/torchcodec/stable/generated_examples/)!
 
 #### Decoding
 
@@ -219,7 +219,7 @@ The bottom row is [promotional video from NASA](https://download.pytorch.org/tor
 that has a resolution of 960x540 at 29.7 fps and is 206 seconds long. Both videos were
 encoded with libx264 and yuv420p pixel format. All decoders, except for TorchVision, used FFmpeg 6.1.2. TorchVision used FFmpeg 4.2.2.
 
-For TorchCodec, the "approx" label means that it was using [approximate mode](https://pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html)
+For TorchCodec, the "approx" label means that it was using [approximate mode](https://meta-pytorch.org/torchcodec/stable/generated_examples/decoding/approximate_mode.html)
 for seeking.
 
 ## Contributing
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
   TORCH_CHECK(false, errorMsg.str());
 }
 
-void validateDoubleOption(
+void tryToValidateCodecOption(
     const AVCodec& avCodec,
     const char* optionName,
-    double value) {
+    const std::string& value) {
   if (!avCodec.priv_class) {
     return;
   }
@@ -586,24 +586,60 @@ void validateDoubleOption(
       0,
       AV_OPT_SEARCH_FAKE_OBJ,
       nullptr);
-  // If the option was not found, let FFmpeg handle it later
+  // If option is not found we cannot validate it, let FFmpeg handle it
   if (!option) {
     return;
   }
+  // Validate if option is defined as a numeric type
   if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
       option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
-    TORCH_CHECK(
-        value >= option->min && value <= option->max,
-        optionName,
-        "=",
-        value,
-        " is out of valid range [",
-        option->min,
-        ", ",
-        option->max,
-        "] for this codec. For more details, run 'ffmpeg -h encoder=",
-        avCodec.name,
-        "'");
+    try {
+      double numericValue = std::stod(value);
+      TORCH_CHECK(
+          numericValue >= option->min && numericValue <= option->max,
+          optionName,
+          "=",
+          numericValue,
+          " is out of valid range [",
+          option->min,
+          ", ",
+          option->max,
+          "] for this codec. For more details, run 'ffmpeg -h encoder=",
+          avCodec.name,
+          "'");
+    } catch (const std::invalid_argument& e) {
+      TORCH_CHECK(
+          false,
+          "Option ",
+          optionName,
+          " expects a numeric value but got '",
+          value,
+          "'");
+    }
+  }
+}
+
+void sortCodecOptions(
+    const std::map<std::string, std::string>& extraOptions,
+    AVDictionary** codecDict,
+    AVDictionary** formatDict) {
+  // Accepts a map of options as input, then sorts them into codec options and
+  // format options. The sorted options are returned into two separate dicts.
+  const AVClass* formatClass = avformat_get_class();
+  for (const auto& [key, value] : extraOptions) {
+    const AVOption* fmtOpt = av_opt_find2(
+        &formatClass,
+        key.c_str(),
+        nullptr,
+        0,
+        AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
+        nullptr);
+    if (fmtOpt) {
+      av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
+    } else {
+      // Default to codec option (includes AVCodecContext + encoder-private)
+      av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
+    }
   }
 }
 } // namespace
@@ -621,6 +657,7 @@ VideoEncoder::~VideoEncoder() {
       avFormatContext_->pb = nullptr;
     }
   }
+  av_dict_free(&avFormatOptions_);
 }
 
 VideoEncoder::VideoEncoder(
@@ -760,21 +797,31 @@ void VideoEncoder::initializeEncoder(
   }
 
   // Apply videoStreamOptions
-  AVDictionary* options = nullptr;
+  AVDictionary* avCodecOptions = nullptr;
+  if (videoStreamOptions.extraOptions.has_value()) {
+    for (const auto& [key, value] : videoStreamOptions.extraOptions.value()) {
+      tryToValidateCodecOption(*avCodec, key.c_str(), value);
+    }
+    sortCodecOptions(
+        videoStreamOptions.extraOptions.value(),
+        &avCodecOptions,
+        &avFormatOptions_);
+  }
+
   if (videoStreamOptions.crf.has_value()) {
-    validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
-    av_dict_set(
-        &options,
-        "crf",
-        std::to_string(videoStreamOptions.crf.value()).c_str(),
-        0);
+    std::string crfValue = std::to_string(videoStreamOptions.crf.value());
+    tryToValidateCodecOption(*avCodec, "crf", crfValue);
+    av_dict_set(&avCodecOptions, "crf", crfValue.c_str(), 0);
   }
   if (videoStreamOptions.preset.has_value()) {
     av_dict_set(
-        &options, "preset", videoStreamOptions.preset.value().c_str(), 0);
+        &avCodecOptions,
+        "preset",
+        videoStreamOptions.preset.value().c_str(),
+        0);
   }
-  int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
-  av_dict_free(&options);
+  int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
+  av_dict_free(&avCodecOptions);
 
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -799,7 +846,7 @@ void VideoEncoder::encode() {
   TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
   encodeWasCalled_ = true;
 
-  int status = avformat_write_header(avFormatContext_.get(), nullptr);
+  int status = avformat_write_header(avFormatContext_.get(), &avFormatOptions_);
   TORCH_CHECK(
       status == AVSUCCESS,
       "Error in avformat_write_header: ",
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,9 +1,15 @@
 #pragma once
 #include <torch/types.h>
+#include <map>
+#include <string>
 #include "AVIOContextHolder.h"
 #include "FFMPEGCommon.h"
 #include "StreamOptions.h"
 
+extern "C" {
+#include <libavutil/dict.h>
+}
+
 namespace facebook::torchcodec {
 class AudioEncoder {
  public:
@@ -179,6 +185,7 @@ class VideoEncoder {
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
 
   bool encodeWasCalled_ = false;
+  AVDictionary* avFormatOptions_ = nullptr;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -296,6 +296,14 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
     streamMetadata.numFramesFromContent =
         streamInfos_[streamIndex].allFrames.size();
 
+    // This ensures that we are robust in handling cases where
+    // we are decoding in exact mode and numFrames is 0. The current metadata
+    // validation logic assumes that these values should not be None
+    if (streamMetadata.numFramesFromContent.value() == 0) {
+      streamMetadata.beginStreamPtsFromContent = 0;
+      streamMetadata.endStreamPtsFromContent = 0;
+    }
+
     if (streamMetadata.beginStreamPtsFromContent.has_value()) {
       streamMetadata.beginStreamPtsSecondsFromContent = ptsToSeconds(
           *streamMetadata.beginStreamPtsFromContent, avStream->time_base);
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <torch/types.h>
+#include <map>
 #include <optional>
 #include <string>
 #include <string_view>
@@ -51,6 +52,7 @@ struct VideoStreamOptions {
   std::optional<std::string> pixelFormat;
   std::optional<double> crf;
   std::optional<std::string> preset;
+  std::optional<std::map<std::string, std::string>> extraOptions;
 };
 
 struct AudioStreamOptions {
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -158,6 +158,16 @@ std::string quoteValue(const std::string& value) {
   return "\"" + value + "\"";
 }
 
+// Helper function to unflatten extra_options, alternating keys and values
+std::map<std::string, std::string> unflattenExtraOptions(
+    const std::vector<std::string>& opts) {
+  std::map<std::string, std::string> optionsMap;
+  for (size_t i = 0; i < opts.size(); i += 2) {
+    optionsMap[opts[i]] = opts[i + 1];
+  }
+  return optionsMap;
+}
+
 std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
   std::stringstream ss;
   ss << "{\n";
@@ -606,12 +616,19 @@ void encode_video_to_file(
     std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
-    std::optional<std::string_view> preset = std::nullopt) {
+    std::optional<std::string_view> preset = std::nullopt,
+    std::optional<std::vector<std::string>> extra_options = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
+
+  if (extra_options.has_value()) {
+    videoStreamOptions.extraOptions =
+        unflattenExtraOptions(extra_options.value());
+  }
+
   VideoEncoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),
@@ -627,13 +644,20 @@ at::Tensor encode_video_to_tensor(
     std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
-    std::optional<std::string_view> preset = std::nullopt) {
+    std::optional<std::string_view> preset = std::nullopt,
+    std::optional<std::vector<std::string>> extra_options = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
+
+  if (extra_options.has_value()) {
+    videoStreamOptions.extraOptions =
+        unflattenExtraOptions(extra_options.value());
+  }
+
   return VideoEncoder(
              frames,
              validateInt64ToInt(frame_rate, "frame_rate"),
@@ -651,7 +675,8 @@ void _encode_video_to_file_like(
     std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
-    std::optional<std::string_view> preset = std::nullopt) {
+    std::optional<std::string_view> preset = std::nullopt,
+    std::optional<std::vector<std::string>> extra_options = std::nullopt) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
   TORCH_CHECK(
@@ -664,6 +689,11 @@ void _encode_video_to_file_like(
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
 
+  if (extra_options.has_value()) {
+    videoStreamOptions.extraOptions =
+        unflattenExtraOptions(extra_options.value());
+  }
+
   VideoEncoder encoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -217,6 +217,7 @@ def encode_video_to_file_like(
     pixel_format: Optional[str] = None,
     crf: Optional[Union[int, float]] = None,
     preset: Optional[str] = None,
+    extra_options: Optional[list[str]] = None,
 ) -> None:
     """Encode video frames to a file-like object.
 
@@ -229,6 +230,7 @@ def encode_video_to_file_like(
         pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
         crf: Optional constant rate factor for encoding quality
         preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
+        extra_options: Optional list of extra options as flattened key-value pairs
     """
     assert _pybind_ops is not None
 
@@ -241,6 +243,7 @@ def encode_video_to_file_like(
         pixel_format,
         crf,
         preset,
+        extra_options,
     )
 
 
@@ -330,8 +333,9 @@ def encode_video_to_file_abstract(
     filename: str,
     codec: Optional[str],
     pixel_format: Optional[str] = None,
-    crf: Optional[Union[int, float]] = None,
     preset: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    extra_options: Optional[list[str]] = None,
 ) -> None:
     return
 
@@ -343,8 +347,9 @@ def encode_video_to_tensor_abstract(
     format: str,
     codec: Optional[str],
     pixel_format: Optional[str] = None,
-    crf: Optional[Union[int, float]] = None,
     preset: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    extra_options: Optional[list[str]] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
@@ -357,8 +362,9 @@ def _encode_video_to_file_like_abstract(
     file_like_context: int,
     codec: Optional[str],
     pixel_format: Optional[str] = None,
-    crf: Optional[Union[int, float]] = None,
     preset: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    extra_options: Optional[list[str]] = None,
 ) -> None:
     return
 
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
diff --git a/test/test_encoders.py b/test/test_encoders.py