meta-pytorch
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
Lines changed: 10 additions & 6 deletions
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
Lines changed: 3 additions & 3 deletions
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
Lines changed: 1 addition & 0 deletions
diff --git a/src/torchcodec/_core/Metadata.cpp b/src/torchcodec/_core/Metadata.cpp
Lines changed: 6 additions & 3 deletions
diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h
Lines changed: 12 additions & 1 deletion
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
Lines changed: 40 additions & 33 deletions
diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py
Lines changed: 2 additions & 1 deletion
@@ -607,7 +607,7 @@ void tryToValidateCodecOption(
           "] for this codec. For more details, run 'ffmpeg -h encoder=",
           avCodec.name,
           "'");
-    } catch (const std::invalid_argument& e) {
+    } catch (const std::invalid_argument&) {
       TORCH_CHECK(
           false,
           "Option ",
@@ -662,7 +662,7 @@ VideoEncoder::~VideoEncoder() {
 
 VideoEncoder::VideoEncoder(
     const torch::Tensor& frames,
-    int frameRate,
+    double frameRate,
     std::string_view fileName,
     const VideoStreamOptions& videoStreamOptions)
     : frames_(validateFrames(frames)), inFrameRate_(frameRate) {
@@ -694,7 +694,7 @@ VideoEncoder::VideoEncoder(
 
 VideoEncoder::VideoEncoder(
     const torch::Tensor& frames,
-    int frameRate,
+    double frameRate,
     std::string_view formatName,
     std::unique_ptr<AVIOContextHolder> avioContextHolder,
     const VideoStreamOptions& videoStreamOptions)
@@ -787,9 +787,9 @@ void VideoEncoder::initializeEncoder(
   avCodecContext_->width = outWidth_;
   avCodecContext_->height = outHeight_;
   avCodecContext_->pix_fmt = outPixelFormat_;
-  // TODO-VideoEncoder: Verify that frame_rate and time_base are correct
-  avCodecContext_->time_base = {1, inFrameRate_};
-  avCodecContext_->framerate = {inFrameRate_, 1};
+  // TODO-VideoEncoder: Add and utilize output frame_rate option
+  avCodecContext_->framerate = av_d2q(inFrameRate_, INT_MAX);
+  avCodecContext_->time_base = av_inv_q(avCodecContext_->framerate);
 
   // Set flag for containers that require extradata to be in the codec context
   if (avFormatContext_->oformat->flags & AVFMT_GLOBALHEADER) {
@@ -833,6 +833,10 @@ void VideoEncoder::initializeEncoder(
 
   // Set the stream time base to encode correct frame timestamps
   avStream_->time_base = avCodecContext_->time_base;
+  // Set the stream frame rate to store correct frame durations for some
+  // containers (webm, mkv)
+  avStream_->r_frame_rate = avCodecContext_->framerate;
+
   status = avcodec_parameters_from_context(
       avStream_->codecpar, avCodecContext_.get());
   TORCH_CHECK(
 
@@ -143,13 +143,13 @@ class VideoEncoder {
 
   VideoEncoder(
       const torch::Tensor& frames,
-      int frameRate,
+      double frameRate,
       std::string_view fileName,
       const VideoStreamOptions& videoStreamOptions);
 
   VideoEncoder(
       const torch::Tensor& frames,
-      int frameRate,
+      double frameRate,
       std::string_view formatName,
       std::unique_ptr<AVIOContextHolder> avioContextHolder,
       const VideoStreamOptions& videoStreamOptions);
@@ -172,7 +172,7 @@ class VideoEncoder {
   UniqueSwsContext swsContext_;
 
   const torch::Tensor frames_;
-  int inFrameRate_;
+  double inFrameRate_;
 
   int inWidth_ = -1;
   int inHeight_ = -1;
 
@@ -158,6 +158,16 @@ int getNumChannels(const SharedAVCodecContext& avCodecContext) {
 #endif
 }
 
+// Returns the number of audio channels stored in the given codec parameters.
+// Fails a TORCH_CHECK with "codecpar is null" when codecpar is nullptr.
+// The field that is read is selected at compile time by the version check
+// below: ch_layout.nb_channels when the check passes, channels otherwise.
+int getNumChannels(const AVCodecParameters* codecpar) {
+  TORCH_CHECK(codecpar != nullptr, "codecpar is null");
+#if LIBAVFILTER_VERSION_MAJOR > 8 || \
+    (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
+  return codecpar->ch_layout.nb_channels;
+#else
+  return codecpar->channels;
+#endif
+}
+
 void setDefaultChannelLayout(
     UniqueAVCodecContext& avCodecContext,
     int numChannels) {
 
@@ -180,6 +180,7 @@ const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec);
 
 int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const SharedAVCodecContext& avCodecContext);
+int getNumChannels(const AVCodecParameters* codecpar);
 
 void setDefaultChannelLayout(
     UniqueAVCodecContext& avCodecContext,
 
@@ -29,6 +29,9 @@ std::optional<double> StreamMetadata::getDurationSeconds(
         return static_cast<double>(numFramesFromHeader.value()) /
             averageFpsFromHeader.value();
       }
+      if (durationSecondsFromContainer.has_value()) {
+        return durationSecondsFromContainer.value();
+      }
       return std::nullopt;
     default:
       TORCH_CHECK(false, "Unknown SeekMode");
@@ -80,13 +83,13 @@ std::optional<int64_t> StreamMetadata::getNumFrames(SeekMode seekMode) const {
           numFramesFromContent.has_value(), "Missing numFramesFromContent");
       return numFramesFromContent.value();
     case SeekMode::approximate: {
+      auto durationSeconds = getDurationSeconds(seekMode);
       if (numFramesFromHeader.has_value()) {
         return numFramesFromHeader.value();
       }
-      if (averageFpsFromHeader.has_value() &&
-          durationSecondsFromHeader.has_value()) {
+      if (averageFpsFromHeader.has_value() && durationSeconds.has_value()) {
         return static_cast<int64_t>(
-            averageFpsFromHeader.value() * durationSecondsFromHeader.value());
+            averageFpsFromHeader.value() * durationSeconds.value());
       }
       return std::nullopt;
     }
 
@@ -23,9 +23,11 @@ enum class SeekMode { exact, approximate, custom_frame_mappings };
 struct StreamMetadata {
   // Common (video and audio) fields derived from the AVStream.
   int streamIndex;
+
   // See this link for what various values are available:
   // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
   AVMediaType mediaType;
+
   std::optional<AVCodecID> codecId;
   std::optional<std::string> codecName;
   std::optional<double> durationSecondsFromHeader;
@@ -35,17 +37,22 @@ struct StreamMetadata {
   std::optional<double> averageFpsFromHeader;
   std::optional<double> bitRate;
 
+  // Used as fallback in approximate mode when stream duration is unavailable.
+  std::optional<double> durationSecondsFromContainer;
+
   // More accurate duration, obtained by scanning the file.
   // These presentation timestamps are in time base.
   std::optional<int64_t> beginStreamPtsFromContent;
   std::optional<int64_t> endStreamPtsFromContent;
+
   // These presentation timestamps are in seconds.
   std::optional<double> beginStreamPtsSecondsFromContent;
   std::optional<double> endStreamPtsSecondsFromContent;
+
   // This can be useful for index-based seeking.
   std::optional<int64_t> numFramesFromContent;
 
-  // Video-only fields derived from the AVCodecContext.
+  // Video-only fields
   std::optional<int> width;
   std::optional<int> height;
   std::optional<AVRational> sampleAspectRatio;
@@ -67,13 +74,17 @@ struct ContainerMetadata {
   std::vector<StreamMetadata> allStreamMetadata;
   int numAudioStreams = 0;
   int numVideoStreams = 0;
+
   // Note that this is the container-level duration, which is usually the max
   // of all stream durations available in the container.
   std::optional<double> durationSecondsFromHeader;
+
   // Total BitRate level information at the container level in bit/s
   std::optional<double> bitRate;
+
   // If set, this is the index to the default audio stream.
   std::optional<int> bestAudioStreamIndex;
+
   // If set, this is the index to the default video stream.
   std::optional<int> bestVideoStreamIndex;
 };
 
@@ -100,6 +100,26 @@ void SingleStreamDecoder::initializeDecoder() {
       "Failed to find stream info: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
+  if (formatContext_->duration > 0) {
+    AVRational defaultTimeBase{1, AV_TIME_BASE};
+    containerMetadata_.durationSecondsFromHeader =
+        ptsToSeconds(formatContext_->duration, defaultTimeBase);
+  }
+
+  if (formatContext_->bit_rate > 0) {
+    containerMetadata_.bitRate = formatContext_->bit_rate;
+  }
+
+  int bestVideoStream = getBestStreamIndex(AVMEDIA_TYPE_VIDEO);
+  if (bestVideoStream >= 0) {
+    containerMetadata_.bestVideoStreamIndex = bestVideoStream;
+  }
+
+  int bestAudioStream = getBestStreamIndex(AVMEDIA_TYPE_AUDIO);
+  if (bestAudioStream >= 0) {
+    containerMetadata_.bestAudioStreamIndex = bestAudioStream;
+  }
+
   for (unsigned int i = 0; i < formatContext_->nb_streams; i++) {
     AVStream* avStream = formatContext_->streams[i];
     StreamMetadata streamMetadata;
@@ -110,8 +130,8 @@ void SingleStreamDecoder::initializeDecoder() {
             ", does not match AVStream's index, " +
             std::to_string(avStream->index) + ".");
     streamMetadata.streamIndex = i;
-    streamMetadata.mediaType = avStream->codecpar->codec_type;
     streamMetadata.codecName = avcodec_get_name(avStream->codecpar->codec_id);
+    streamMetadata.mediaType = avStream->codecpar->codec_type;
     streamMetadata.bitRate = avStream->codecpar->bit_rate;
 
     int64_t frameCount = avStream->nb_frames;
@@ -133,10 +153,18 @@ void SingleStreamDecoder::initializeDecoder() {
       if (fps > 0) {
         streamMetadata.averageFpsFromHeader = fps;
       }
+      streamMetadata.width = avStream->codecpar->width;
+      streamMetadata.height = avStream->codecpar->height;
+      streamMetadata.sampleAspectRatio =
+          avStream->codecpar->sample_aspect_ratio;
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
       AVSampleFormat format =
           static_cast<AVSampleFormat>(avStream->codecpar->format);
+      streamMetadata.sampleRate =
+          static_cast<int64_t>(avStream->codecpar->sample_rate);
+      streamMetadata.numChannels =
+          static_cast<int64_t>(getNumChannels(avStream->codecpar));
 
       // If the AVSampleFormat is not recognized, we get back nullptr. We have
       // to make sure we don't initialize a std::string with nullptr. There's
@@ -149,27 +177,10 @@ void SingleStreamDecoder::initializeDecoder() {
       containerMetadata_.numAudioStreams++;
     }
 
-    containerMetadata_.allStreamMetadata.push_back(streamMetadata);
-  }
+    streamMetadata.durationSecondsFromContainer =
+        containerMetadata_.durationSecondsFromHeader;
 
-  if (formatContext_->duration > 0) {
-    AVRational defaultTimeBase{1, AV_TIME_BASE};
-    containerMetadata_.durationSecondsFromHeader =
-        ptsToSeconds(formatContext_->duration, defaultTimeBase);
-  }
-
-  if (formatContext_->bit_rate > 0) {
-    containerMetadata_.bitRate = formatContext_->bit_rate;
-  }
-
-  int bestVideoStream = getBestStreamIndex(AVMEDIA_TYPE_VIDEO);
-  if (bestVideoStream >= 0) {
-    containerMetadata_.bestVideoStreamIndex = bestVideoStream;
-  }
-
-  int bestAudioStream = getBestStreamIndex(AVMEDIA_TYPE_AUDIO);
-  if (bestAudioStream >= 0) {
-    containerMetadata_.bestAudioStreamIndex = bestAudioStream;
+    containerMetadata_.allStreamMetadata.push_back(streamMetadata);
   }
 
   if (seekMode_ == SeekMode::exact) {
@@ -288,6 +299,14 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
     streamMetadata.numFramesFromContent =
         streamInfos_[streamIndex].allFrames.size();
 
+    // This ensures that we are robust in handling cases where
+    // we are decoding in exact mode and numFrames is 0. The current metadata
+    // validation logic assumes that these values should not be None
+    if (streamMetadata.numFramesFromContent.value() == 0) {
+      streamMetadata.beginStreamPtsFromContent = 0;
+      streamMetadata.endStreamPtsFromContent = 0;
+    }
+
     if (streamMetadata.beginStreamPtsFromContent.has_value()) {
       streamMetadata.beginStreamPtsSecondsFromContent = ptsToSeconds(
           *streamMetadata.beginStreamPtsFromContent, avStream->time_base);
@@ -516,11 +535,6 @@ void SingleStreamDecoder::addVideoStream(
   auto& streamInfo = streamInfos_[activeStreamIndex_];
   streamInfo.videoStreamOptions = videoStreamOptions;
 
-  streamMetadata.width = streamInfo.codecContext->width;
-  streamMetadata.height = streamInfo.codecContext->height;
-  streamMetadata.sampleAspectRatio =
-      streamInfo.codecContext->sample_aspect_ratio;
-
   if (seekMode_ == SeekMode::custom_frame_mappings) {
     TORCH_CHECK(
         customFrameMappings.has_value(),
@@ -566,13 +580,6 @@ void SingleStreamDecoder::addAudioStream(
   auto& streamInfo = streamInfos_[activeStreamIndex_];
   streamInfo.audioStreamOptions = audioStreamOptions;
 
-  auto& streamMetadata =
-      containerMetadata_.allStreamMetadata[activeStreamIndex_];
-  streamMetadata.sampleRate =
-      static_cast<int64_t>(streamInfo.codecContext->sample_rate);
-  streamMetadata.numChannels =
-      static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
-
   // FFmpeg docs say that the decoder will try to decode natively in this
   // format, if it can. Docs don't say what the decoder does when it doesn't
   // support that format, but it looks like it does nothing, so this probably
 
@@ -44,7 +44,8 @@ class StreamMetadata:
     from the actual frames if a :term:`scan` was performed. Otherwise we
     fall back to ``duration_seconds_from_header``. If that value is also None,
     we instead calculate the duration from ``num_frames_from_header`` and
-    ``average_fps_from_header``.
+    ``average_fps_from_header``. If all of those are unavailable, we fall back
+    to the container-level ``duration_seconds_from_header``.
     """
     begin_stream_seconds: Optional[float]
     """Beginning of the stream, in seconds (float). Conceptually, this