meta-pytorch
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 73 additions & 26 deletions b/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 73 additions & 26 deletions
diff --git a/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 7 additions & 0 deletions b/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 10 additions & 0 deletions b/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.h‎
Lines changed: 1 addition & 0 deletions b/‎src/torchcodec/_core/FFMPEGCommon.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/torchcodec/_core/Metadata.cpp‎
Lines changed: 6 additions & 3 deletions b/‎src/torchcodec/_core/Metadata.cpp‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎src/torchcodec/_core/Metadata.h‎
Lines changed: 12 additions & 1 deletion b/‎src/torchcodec/_core/Metadata.h‎
Lines changed: 12 additions & 1 deletion
@@ -1,4 +1,4 @@
-[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
+[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://meta-pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://meta-pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
 
 # TorchCodec
 
@@ -23,7 +23,7 @@ We achieve these capabilities through:
 
 Here's a condensed summary of what you can do with TorchCodec. For more detailed
 examples, [check out our
-documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!
+documentation](https://meta-pytorch.org/torchcodec/stable/generated_examples/)!
 
 #### Decoding
 
@@ -219,7 +219,7 @@ The bottom row is [promotional video from NASA](https://download.pytorch.org/tor
 that has a resolution of 960x540 at 29.7 fps and is 206 seconds long. Both videos were
 encoded with libx264 and yuv420p pixel format. All decoders, except for TorchVision, used FFmpeg 6.1.2. TorchVision used FFmpeg 4.2.2.
 
-For TorchCodec, the "approx" label means that it was using [approximate mode](https://pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html)
+For TorchCodec, the "approx" label means that it was using [approximate mode](https://meta-pytorch.org/torchcodec/stable/generated_examples/decoding/approximate_mode.html)
 for seeking.
 
 ## Contributing
 
@@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
   TORCH_CHECK(false, errorMsg.str());
 }
 
-void validateDoubleOption(
+void tryToValidateCodecOption(
     const AVCodec& avCodec,
     const char* optionName,
-    double value) {
+    const std::string& value) {
   if (!avCodec.priv_class) {
     return;
   }
@@ -586,24 +586,60 @@ void validateDoubleOption(
       0,
       AV_OPT_SEARCH_FAKE_OBJ,
       nullptr);
-  // If the option was not found, let FFmpeg handle it later
+  // If option is not found we cannot validate it, let FFmpeg handle it
   if (!option) {
     return;
   }
+  // Validate if option is defined as a numeric type
   if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
       option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
-    TORCH_CHECK(
-        value >= option->min && value <= option->max,
-        optionName,
-        "=",
-        value,
-        " is out of valid range [",
-        option->min,
-        ", ",
-        option->max,
-        "] for this codec. For more details, run 'ffmpeg -h encoder=",
-        avCodec.name,
-        "'");
+    try {
+      double numericValue = std::stod(value);
+      TORCH_CHECK(
+          numericValue >= option->min && numericValue <= option->max,
+          optionName,
+          "=",
+          numericValue,
+          " is out of valid range [",
+          option->min,
+          ", ",
+          option->max,
+          "] for this codec. For more details, run 'ffmpeg -h encoder=",
+          avCodec.name,
+          "'");
+    } catch (const std::invalid_argument&) {
+      TORCH_CHECK(
+          false,
+          "Option ",
+          optionName,
+          " expects a numeric value but got '",
+          value,
+          "'");
+    }
+  }
+}
+
+// Splits user-supplied extra options into format-level and codec-level
+// dictionaries.
+//
+// Each key in extraOptions is looked up with av_opt_find2() against the
+// AVClass returned by avformat_get_class(). The lookup uses
+// AV_OPT_SEARCH_FAKE_OBJ, so no AVFormatContext instance is required, and
+// AV_OPT_SEARCH_CHILDREN, so child classes are searched as well. Keys that
+// are found are stored in *formatDict; every other key is stored in
+// *codecDict (AVCodecContext options and encoder-private options).
+//
+// NOTE(review): a key that names both a format option and a codec option is
+// always routed to *formatDict, because only the format lookup is performed.
+//
+// Entries are written with av_dict_set(); the caller remains responsible for
+// releasing both dictionaries with av_dict_free(). Throws (via TORCH_CHECK)
+// if av_dict_set() reports an error, instead of silently dropping the option.
+void sortCodecOptions(
+    const std::map<std::string, std::string>& extraOptions,
+    AVDictionary** codecDict,
+    AVDictionary** formatDict) {
+  const AVClass* formatClass = avformat_get_class();
+  for (const auto& [key, value] : extraOptions) {
+    const AVOption* fmtOpt = av_opt_find2(
+        &formatClass,
+        key.c_str(),
+        nullptr,
+        0,
+        AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
+        nullptr);
+    // Anything that is not a known format option defaults to the codec dict.
+    AVDictionary** targetDict = fmtOpt ? formatDict : codecDict;
+    const int status =
+        av_dict_set(targetDict, key.c_str(), value.c_str(), 0);
+    // av_dict_set() returns a negative error code on failure (e.g. ENOMEM).
+    TORCH_CHECK(
+        status >= 0,
+        "Failed to store option ",
+        key,
+        "=",
+        value,
+        " (av_dict_set returned ",
+        status,
+        ")");
   }
 }
 } // namespace
@@ -621,6 +657,7 @@ VideoEncoder::~VideoEncoder() {
       avFormatContext_->pb = nullptr;
     }
   }
+  av_dict_free(&avFormatOptions_);
 }
 
 VideoEncoder::VideoEncoder(
@@ -760,21 +797,31 @@ void VideoEncoder::initializeEncoder(
   }
 
   // Apply videoStreamOptions
-  AVDictionary* options = nullptr;
+  AVDictionary* avCodecOptions = nullptr;
+  if (videoStreamOptions.extraOptions.has_value()) {
+    for (const auto& [key, value] : videoStreamOptions.extraOptions.value()) {
+      tryToValidateCodecOption(*avCodec, key.c_str(), value);
+    }
+    sortCodecOptions(
+        videoStreamOptions.extraOptions.value(),
+        &avCodecOptions,
+        &avFormatOptions_);
+  }
+
   if (videoStreamOptions.crf.has_value()) {
-    validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
-    av_dict_set(
-        &options,
-        "crf",
-        std::to_string(videoStreamOptions.crf.value()).c_str(),
-        0);
+    std::string crfValue = std::to_string(videoStreamOptions.crf.value());
+    tryToValidateCodecOption(*avCodec, "crf", crfValue);
+    av_dict_set(&avCodecOptions, "crf", crfValue.c_str(), 0);
   }
   if (videoStreamOptions.preset.has_value()) {
     av_dict_set(
-        &options, "preset", videoStreamOptions.preset.value().c_str(), 0);
+        &avCodecOptions,
+        "preset",
+        videoStreamOptions.preset.value().c_str(),
+        0);
   }
-  int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
-  av_dict_free(&options);
+  int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
+  av_dict_free(&avCodecOptions);
 
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -799,7 +846,7 @@ void VideoEncoder::encode() {
   TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
   encodeWasCalled_ = true;
 
-  int status = avformat_write_header(avFormatContext_.get(), nullptr);
+  int status = avformat_write_header(avFormatContext_.get(), &avFormatOptions_);
   TORCH_CHECK(
       status == AVSUCCESS,
       "Error in avformat_write_header: ",
 
@@ -1,9 +1,15 @@
 #pragma once
 #include <torch/types.h>
+#include <map>
+#include <string>
 #include "AVIOContextHolder.h"
 #include "FFMPEGCommon.h"
 #include "StreamOptions.h"
 
+extern "C" {
+#include <libavutil/dict.h>
+}
+
 namespace facebook::torchcodec {
 class AudioEncoder {
  public:
@@ -179,6 +185,7 @@ class VideoEncoder {
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
 
   bool encodeWasCalled_ = false;
+  AVDictionary* avFormatOptions_ = nullptr;
 };
 
 } // namespace facebook::torchcodec
@@ -158,6 +158,16 @@ int getNumChannels(const SharedAVCodecContext& avCodecContext) {
 #endif
 }
 
+// Returns the number of audio channels recorded in the given codec
+// parameters. Throws (via TORCH_CHECK) when codecpar is null.
+//
+// When the version gate below is satisfied the count is read from
+// ch_layout.nb_channels; otherwise it is read from the `channels` field.
+// NOTE(review): the gate tests the libavfilter version although the field
+// belongs to AVCodecParameters; presumably it mirrors the sibling
+// getNumChannels() overloads -- confirm.
+int getNumChannels(const AVCodecParameters* codecpar) {
+  TORCH_CHECK(codecpar != nullptr, "codecpar is null");
+#if LIBAVFILTER_VERSION_MAJOR > 8 || \
+    (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
+  return codecpar->ch_layout.nb_channels;
+#else
+  return codecpar->channels;
+#endif
+}
+
 void setDefaultChannelLayout(
     UniqueAVCodecContext& avCodecContext,
     int numChannels) {
 
@@ -180,6 +180,7 @@ const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec);
 
 int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const SharedAVCodecContext& avCodecContext);
+int getNumChannels(const AVCodecParameters* codecpar);
 
 void setDefaultChannelLayout(
     UniqueAVCodecContext& avCodecContext,
 
@@ -29,6 +29,9 @@ std::optional<double> StreamMetadata::getDurationSeconds(
         return static_cast<double>(numFramesFromHeader.value()) /
             averageFpsFromHeader.value();
       }
+      if (durationSecondsFromContainer.has_value()) {
+        return durationSecondsFromContainer.value();
+      }
       return std::nullopt;
     default:
       TORCH_CHECK(false, "Unknown SeekMode");
@@ -80,13 +83,13 @@ std::optional<int64_t> StreamMetadata::getNumFrames(SeekMode seekMode) const {
           numFramesFromContent.has_value(), "Missing numFramesFromContent");
       return numFramesFromContent.value();
     case SeekMode::approximate: {
+      auto durationSeconds = getDurationSeconds(seekMode);
       if (numFramesFromHeader.has_value()) {
         return numFramesFromHeader.value();
       }
-      if (averageFpsFromHeader.has_value() &&
-          durationSecondsFromHeader.has_value()) {
+      if (averageFpsFromHeader.has_value() && durationSeconds.has_value()) {
         return static_cast<int64_t>(
-            averageFpsFromHeader.value() * durationSecondsFromHeader.value());
+            averageFpsFromHeader.value() * durationSeconds.value());
       }
       return std::nullopt;
     }
 
@@ -23,9 +23,11 @@ enum class SeekMode { exact, approximate, custom_frame_mappings };
 struct StreamMetadata {
   // Common (video and audio) fields derived from the AVStream.
   int streamIndex;
+
   // See this link for what various values are available:
   // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
   AVMediaType mediaType;
+
   std::optional<AVCodecID> codecId;
   std::optional<std::string> codecName;
   std::optional<double> durationSecondsFromHeader;
@@ -35,17 +37,22 @@ struct StreamMetadata {
   std::optional<double> averageFpsFromHeader;
   std::optional<double> bitRate;
 
+  // Used as fallback in approximate mode when stream duration is unavailable.
+  std::optional<double> durationSecondsFromContainer;
+
   // More accurate duration, obtained by scanning the file.
   // These presentation timestamps are in time base.
   std::optional<int64_t> beginStreamPtsFromContent;
   std::optional<int64_t> endStreamPtsFromContent;
+
   // These presentation timestamps are in seconds.
   std::optional<double> beginStreamPtsSecondsFromContent;
   std::optional<double> endStreamPtsSecondsFromContent;
+
   // This can be useful for index-based seeking.
   std::optional<int64_t> numFramesFromContent;
 
-  // Video-only fields derived from the AVCodecContext.
+  // Video-only fields
   std::optional<int> width;
   std::optional<int> height;
   std::optional<AVRational> sampleAspectRatio;
@@ -67,13 +74,17 @@ struct ContainerMetadata {
   std::vector<StreamMetadata> allStreamMetadata;
   int numAudioStreams = 0;
   int numVideoStreams = 0;
+
   // Note that this is the container-level duration, which is usually the max
   // of all stream durations available in the container.
   std::optional<double> durationSecondsFromHeader;
+
   // Total BitRate level information at the container level in bit/s
   std::optional<double> bitRate;
+
   // If set, this is the index to the default audio stream.
   std::optional<int> bestAudioStreamIndex;
+
   // If set, this is the index to the default video stream.
   std::optional<int> bestVideoStreamIndex;
 };