meta-pytorch
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
Lines changed: 75 additions & 91 deletions
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
Lines changed: 19 additions & 10 deletions
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
Lines changed: 2 additions & 0 deletions
diff --git a/src/torchcodec/decoders/_core/__init__.py b/src/torchcodec/decoders/_core/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -435,11 +435,9 @@ int VideoDecoder::getBestStreamIndex(AVMediaType mediaType) {
 void VideoDecoder::addVideoStreamDecoder(
     int preferredStreamIndex,
     const VideoStreamOptions& videoStreamOptions) {
-  if (activeStreamIndices_.count(preferredStreamIndex) > 0) {
-    throw std::invalid_argument(
-        "Stream with index " + std::to_string(preferredStreamIndex) +
-        " is already active.");
-  }
+  TORCH_CHECK(
+      activeStreamIndex_ == NO_ACTIVE_STREAM,
+      "Can only add one single stream.");
   TORCH_CHECK(formatContext_.get() != nullptr);
 
   AVCodecOnlyUseForCallingAVFindBestStream avCodec = nullptr;
@@ -506,7 +504,7 @@ void VideoDecoder::addVideoStreamDecoder(
   }
 
   codecContext->time_base = streamInfo.stream->time_base;
-  activeStreamIndices_.insert(streamIndex);
+  activeStreamIndex_ = streamIndex;
   updateMetadataWithCodecContext(streamInfo.streamIndex, codecContext);
   streamInfo.videoStreamOptions = videoStreamOptions;
 
@@ -538,6 +536,20 @@ VideoDecoder::ContainerMetadata VideoDecoder::getContainerMetadata() const {
   return containerMetadata_;
 }
 
+// Returns a 1D int64 tensor holding, for each entry of the stream's keyFrames
+// vector (in that order), the entry's frameIndex. The tensor is empty when the
+// stream has no recorded key frames.
+//
+// validateUserProvidedStreamIndex() and validateScannedAllStreams() are called
+// before any data is read; presumably they reject an unknown stream index and
+// a decoder that has not run a full scan, respectively -- confirm against
+// their definitions.
+torch::Tensor VideoDecoder::getKeyFrameIndices(int streamIndex) {
+  validateUserProvidedStreamIndex(streamIndex);
+  validateScannedAllStreams("getKeyFrameIndices");
+
+  const std::vector<FrameInfo>& keyFrames = streamInfos_[streamIndex].keyFrames;
+  const auto numKeyFrames = static_cast<int64_t>(keyFrames.size());
+  torch::Tensor keyFrameIndices =
+      torch::empty({numKeyFrames}, {torch::kInt64});
+
+  // Fill through a typed accessor instead of `keyFrameIndices[i] = value`:
+  // the indexing form builds a temporary tensor view and runs a dispatched
+  // fill for every element, whereas the accessor is a plain memory write.
+  auto indices = keyFrameIndices.accessor<int64_t, 1>();
+  for (size_t i = 0; i < keyFrames.size(); ++i) {
+    indices[static_cast<int64_t>(i)] = keyFrames[i].frameIndex;
+  }
+
+  return keyFrameIndices;
+}
+
 int VideoDecoder::getKeyFrameIndexForPtsUsingEncoderIndex(
     AVStream* stream,
     int64_t pts) const {
@@ -654,7 +666,21 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
           return frameInfo1.pts < frameInfo2.pts;
         });
 
+    size_t keyIndex = 0;
     for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
+      streamInfo.allFrames[i].frameIndex = i;
+
+      // For correctly encoded files, we shouldn't need to ensure that keyIndex
+      // is less than the number of key frames. That is, the relationship
+      // between the frames in allFrames and keyFrames should be such that
+      // keyIndex is always a valid index into keyFrames. But we're being
+      // defensive in case we encounter incorrectly encoded files.
+      if (keyIndex < streamInfo.keyFrames.size() &&
+          streamInfo.keyFrames[keyIndex].pts == streamInfo.allFrames[i].pts) {
+        streamInfo.keyFrames[keyIndex].frameIndex = i;
+        ++keyIndex;
+      }
+
       if (i + 1 < streamInfo.allFrames.size()) {
         streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
       }
@@ -726,53 +752,39 @@ bool VideoDecoder::canWeAvoidSeekingForStream(
 // AVFormatContext if it is needed. We can skip seeking in certain cases. See
 // the comment of canWeAvoidSeekingForStream() for details.
 void VideoDecoder::maybeSeekToBeforeDesiredPts() {
-  if (activeStreamIndices_.size() == 0) {
+  if (activeStreamIndex_ == NO_ACTIVE_STREAM) {
     return;
   }
-  for (int streamIndex : activeStreamIndices_) {
-    StreamInfo& streamInfo = streamInfos_[streamIndex];
-    // clang-format off: clang format clashes
-    streamInfo.discardFramesBeforePts = secondsToClosestPts(*desiredPtsSeconds_, streamInfo.timeBase);
-    // clang-format on
-  }
+  StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
+  streamInfo.discardFramesBeforePts =
+      secondsToClosestPts(*desiredPtsSeconds_, streamInfo.timeBase);
 
   decodeStats_.numSeeksAttempted++;
-  // See comment for canWeAvoidSeeking() for details on why this optimization
-  // works.
-  bool mustSeek = false;
-  for (int streamIndex : activeStreamIndices_) {
-    StreamInfo& streamInfo = streamInfos_[streamIndex];
-    int64_t desiredPtsForStream = *desiredPtsSeconds_ * streamInfo.timeBase.den;
-    if (!canWeAvoidSeekingForStream(
-            streamInfo, streamInfo.currentPts, desiredPtsForStream)) {
-      mustSeek = true;
-      break;
-    }
-  }
-  if (!mustSeek) {
+
+  int64_t desiredPtsForStream = *desiredPtsSeconds_ * streamInfo.timeBase.den;
+  if (canWeAvoidSeekingForStream(
+          streamInfo, streamInfo.currentPts, desiredPtsForStream)) {
     decodeStats_.numSeeksSkipped++;
     return;
   }
-  int firstActiveStreamIndex = *activeStreamIndices_.begin();
-  const auto& firstStreamInfo = streamInfos_[firstActiveStreamIndex];
   int64_t desiredPts =
-      secondsToClosestPts(*desiredPtsSeconds_, firstStreamInfo.timeBase);
+      secondsToClosestPts(*desiredPtsSeconds_, streamInfo.timeBase);
 
   // For some encodings like H265, FFMPEG sometimes seeks past the point we
   // set as the max_ts. So we use our own index to give it the exact pts of
   // the key frame that we want to seek to.
   // See https://github.com/pytorch/torchcodec/issues/179 for more details.
   // See https://trac.ffmpeg.org/ticket/11137 for the underlying ffmpeg bug.
-  if (!firstStreamInfo.keyFrames.empty()) {
+  if (!streamInfo.keyFrames.empty()) {
     int desiredKeyFrameIndex = getKeyFrameIndexForPtsUsingScannedIndex(
-        firstStreamInfo.keyFrames, desiredPts);
+        streamInfo.keyFrames, desiredPts);
     desiredKeyFrameIndex = std::max(desiredKeyFrameIndex, 0);
-    desiredPts = firstStreamInfo.keyFrames[desiredKeyFrameIndex].pts;
+    desiredPts = streamInfo.keyFrames[desiredKeyFrameIndex].pts;
   }
 
   int ffmepgStatus = avformat_seek_file(
       formatContext_.get(),
-      firstStreamInfo.streamIndex,
+      streamInfo.streamIndex,
       INT64_MIN,
       desiredPts,
       desiredPts,
@@ -783,15 +795,12 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
         getFFMPEGErrorStringFromErrorCode(ffmepgStatus));
   }
   decodeStats_.numFlushes++;
-  for (int streamIndex : activeStreamIndices_) {
-    StreamInfo& streamInfo = streamInfos_[streamIndex];
-    avcodec_flush_buffers(streamInfo.codecContext.get());
-  }
+  avcodec_flush_buffers(streamInfo.codecContext.get());
 }
 
 VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
-    std::function<bool(int, AVFrame*)> filterFunction) {
-  if (activeStreamIndices_.size() == 0) {
+    std::function<bool(AVFrame*)> filterFunction) {
+  if (activeStreamIndex_ == NO_ACTIVE_STREAM) {
     throw std::runtime_error("No active streams configured.");
   }
 
@@ -803,44 +812,25 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     desiredPtsSeconds_ = std::nullopt;
   }
 
+  StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
+
   // Need to get the next frame or error from PopFrame.
   UniqueAVFrame avFrame(av_frame_alloc());
   AutoAVPacket autoAVPacket;
   int ffmpegStatus = AVSUCCESS;
   bool reachedEOF = false;
-  int frameStreamIndex = -1;
   while (true) {
-    frameStreamIndex = -1;
-    bool gotPermanentErrorOnAnyActiveStream = false;
-
-    // Get a frame on an active stream. Note that we don't know ahead of time
-    // which streams have frames to receive, so we linearly try the active
-    // streams.
-    for (int streamIndex : activeStreamIndices_) {
-      StreamInfo& streamInfo = streamInfos_[streamIndex];
-      ffmpegStatus =
-          avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
-
-      if (ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN)) {
-        gotPermanentErrorOnAnyActiveStream = true;
-        break;
-      }
+    ffmpegStatus =
+        avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
 
-      if (ffmpegStatus == AVSUCCESS) {
-        frameStreamIndex = streamIndex;
-        break;
-      }
-    }
-
-    if (gotPermanentErrorOnAnyActiveStream) {
+    if (ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN)) {
+      // Non-retriable error
       break;
     }
 
     decodeStats_.numFramesReceivedByDecoder++;
-
     // Is this the kind of frame we're looking for?
-    if (ffmpegStatus == AVSUCCESS &&
-        filterFunction(frameStreamIndex, avFrame.get())) {
+    if (ffmpegStatus == AVSUCCESS && filterFunction(avFrame.get())) {
       // Yes, this is the frame we'll return; break out of the decoding loop.
       break;
     } else if (ffmpegStatus == AVSUCCESS) {
@@ -865,18 +855,15 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     decodeStats_.numPacketsRead++;
 
     if (ffmpegStatus == AVERROR_EOF) {
-      // End of file reached. We must drain all codecs by sending a nullptr
+      // End of file reached. We must drain the codec by sending a nullptr
       // packet.
-      for (int streamIndex : activeStreamIndices_) {
-        StreamInfo& streamInfo = streamInfos_[streamIndex];
-        ffmpegStatus = avcodec_send_packet(
-            streamInfo.codecContext.get(),
-            /*avpkt=*/nullptr);
-        if (ffmpegStatus < AVSUCCESS) {
-          throw std::runtime_error(
-              "Could not flush decoder: " +
-              getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
-        }
+      ffmpegStatus = avcodec_send_packet(
+          streamInfo.codecContext.get(),
+          /*avpkt=*/nullptr);
+      if (ffmpegStatus < AVSUCCESS) {
+        throw std::runtime_error(
+            "Could not flush decoder: " +
+            getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
       }
 
       // We've reached the end of file so we can't read any more packets from
@@ -892,15 +879,14 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
           getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
     }
 
-    if (activeStreamIndices_.count(packet->stream_index) == 0) {
-      // This packet is not for any of the active streams.
+    if (packet->stream_index != activeStreamIndex_) {
       continue;
     }
 
     // We got a valid packet. Send it to the decoder, and we'll receive it in
     // the next iteration.
-    ffmpegStatus = avcodec_send_packet(
-        streamInfos_[packet->stream_index].codecContext.get(), packet.get());
+    ffmpegStatus =
+        avcodec_send_packet(streamInfo.codecContext.get(), packet.get());
     if (ffmpegStatus < AVSUCCESS) {
       throw std::runtime_error(
           "Could not push packet to decoder: " +
@@ -927,11 +913,10 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
   // haven't received as frames. Eventually we will either hit AVERROR_EOF from
   // avcodec_receive_frame() or the user will have seeked to a different location in
   // the file and that will flush the decoder.
-  StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
-  activeStreamInfo.currentPts = avFrame->pts;
-  activeStreamInfo.currentDuration = getDuration(avFrame);
+  streamInfo.currentPts = avFrame->pts;
+  streamInfo.currentDuration = getDuration(avFrame);
 
-  return AVFrameStream(std::move(avFrame), frameStreamIndex);
+  return AVFrameStream(std::move(avFrame), activeStreamIndex_);
 }
 
 VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
@@ -1096,8 +1081,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAtNoDemux(
 
   setCursorPtsInSeconds(seconds);
   AVFrameStream avFrameStream =
-      decodeAVFrame([seconds, this](int frameStreamIndex, AVFrame* avFrame) {
-        StreamInfo& streamInfo = streamInfos_[frameStreamIndex];
+      decodeAVFrame([seconds, this](AVFrame* avFrame) {
+        StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
         double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
         double frameEndTime = ptsToSeconds(
             avFrame->pts + getDuration(avFrame), streamInfo.timeBase);
@@ -1496,11 +1481,10 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemux() {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemuxInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrameStream avFrameStream =
-      decodeAVFrame([this](int frameStreamIndex, AVFrame* avFrame) {
-        StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
-        return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
-      });
+  AVFrameStream avFrameStream = decodeAVFrame([this](AVFrame* avFrame) {
+    StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
+    return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
+  });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
 }
 
 
@@ -97,6 +97,10 @@ class VideoDecoder {
   // Returns the metadata for the container.
   ContainerMetadata getContainerMetadata() const;
 
+  // Returns the key frame indices as a tensor. The tensor is 1D and contains
+  // int64 values, where each value is the frame index for a key frame.
+  torch::Tensor getKeyFrameIndices(int streamIndex);
+
   // --------------------------------------------------------------------------
   // ADDING STREAMS API
   // --------------------------------------------------------------------------
@@ -284,12 +288,19 @@ class VideoDecoder {
 
   struct FrameInfo {
     int64_t pts = 0;
-    // The value of this default is important: the last frame's nextPts will be
-    // INT64_MAX, which ensures that the allFrames vec contains FrameInfo
-    // structs with *increasing* nextPts values. That's a necessary condition
-    // for the binary searches on those values to work properly (as typically
-    // done during pts -> index conversions.)
+
+    // The value of the nextPts default is important: the last frame's nextPts
+    // will be INT64_MAX, which ensures that the allFrames vec contains
+    // FrameInfo structs with *increasing* nextPts values. That's a necessary
+    // condition for the binary searches on those values to work properly (as
+    // typically done during pts -> index conversions).
     int64_t nextPts = INT64_MAX;
+
+    // Note that frameIndex is ALWAYS the index into all of the frames in that
+    // stream, even when the FrameInfo is part of the key frame index. Given a
+    // FrameInfo for a key frame, the frameIndex allows us to know which frame
+    // that is in the stream.
+    int64_t frameIndex = 0;
   };
 
   struct FilterGraphContext {
@@ -361,8 +372,7 @@ class VideoDecoder {
 
   void maybeSeekToBeforeDesiredPts();
 
-  AVFrameStream decodeAVFrame(
-      std::function<bool(int, AVFrame*)> filterFunction);
+  AVFrameStream decodeAVFrame(std::function<bool(AVFrame*)> filterFunction);
 
   FrameOutput getNextFrameNoDemuxInternal(
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
@@ -469,9 +479,8 @@ class VideoDecoder {
   ContainerMetadata containerMetadata_;
   UniqueAVFormatContext formatContext_;
   std::map<int, StreamInfo> streamInfos_;
-  // Stores the stream indices of the active streams, i.e. the streams we are
-  // decoding and returning to the user.
-  std::set<int> activeStreamIndices_;
+  const int NO_ACTIVE_STREAM = -2;
+  int activeStreamIndex_ = NO_ACTIVE_STREAM;
   // Set when the user wants to seek and stores the desired pts that the user
   // wants to seek to.
   std::optional<double> desiredPtsSeconds_;
 
@@ -48,6 +48,8 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "get_frames_by_pts_in_range(Tensor(a!) decoder, *, int stream_index, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_by_pts(Tensor(a!) decoder, *, int stream_index, float[] timestamps) -> (Tensor, Tensor, Tensor)");
+  m.def(
+      "_get_key_frame_indices(Tensor(a!) decoder, int stream_index) -> Tensor");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
   m.def(
@@ -334,6 +336,13 @@ bool _test_frame_pts_equality(
       videoDecoder->getPtsSecondsForFrame(stream_index, frame_index);
 }
 
+// Op implementation bound to "_get_key_frame_indices": obtains the decoder
+// from the given tensor via unwrapTensorToGetDecoder() and forwards the
+// requested stream index to VideoDecoder::getKeyFrameIndices(), returning its
+// result unchanged.
+torch::Tensor _get_key_frame_indices(
+    at::Tensor& decoder,
+    int64_t stream_index) {
+  // getKeyFrameIndices() takes an int; the narrowing is made explicit here.
+  const int streamIndex = static_cast<int>(stream_index);
+  return unwrapTensorToGetDecoder(decoder)->getKeyFrameIndices(streamIndex);
+}
+
 std::string get_json_metadata(at::Tensor& decoder) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
 
@@ -526,6 +535,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
   m.impl("get_next_frame", &get_next_frame);
+  m.impl("_get_key_frame_indices", &_get_key_frame_indices);
   m.impl("get_json_metadata", &get_json_metadata);
   m.impl("get_container_json_metadata", &get_container_json_metadata);
   m.impl("get_stream_json_metadata", &get_stream_json_metadata);
 
@@ -137,6 +137,8 @@ bool _test_frame_pts_equality(
     int64_t frame_index,
     double pts_seconds_to_test);
 
+torch::Tensor _get_key_frame_indices(at::Tensor& decoder, int64_t stream_index);
+
 // Get the metadata from the video as a string.
 std::string get_json_metadata(at::Tensor& decoder);
 
 
@@ -13,6 +13,7 @@
 )
 from .video_decoder_ops import (
     _add_video_stream,
+    _get_key_frame_indices,
     _test_frame_pts_equality,
     add_video_stream,
     create_from_bytes,
Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@`
`13`	`13`	`)`
`14`	`14`	`from .video_decoder_ops import (`
`15`	`15`	`_add_video_stream,`
	`16`	`+ _get_key_frame_indices,`
`16`	`17`	`_test_frame_pts_equality,`
`17`	`18`	`add_video_stream,`
`18`	`19`	`create_from_bytes,`