Add correct support for getFramesPlayedInRange

NicolasHug · NicolasHug · commit 8a4a4440cbde · 2025-03-04T14:18:41.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -787,6 +787,22 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
     double stopSeconds) {
   validateActiveStream();
 
+  // Because we currently never seek with audio streams, we prevent users from
+  // calling this method twice. We could allow multiple calls in the future.
+  // Assuming 2 consecutive calls:
+  // ```
+  // getFramesPlayedInRange(startSeconds1, stopSeconds1);
+  // getFramesPlayedInRange(startSeconds2, stopSeconds2);
+  // ```
+  // We would need to seek back to 0 iff startSeconds2 <= stopSeconds1. This
+  // logic is not implemented for now, so we just error.
+
+  TORCH_CHECK(
+      streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO ||
+          !alreadyCalledGetFramesPlayedInRange_,
+      "Can only decode once with audio stream. Re-create a decoder object if needed.")
+  alreadyCalledGetFramesPlayedInRange_ = true;
+
   TORCH_CHECK(
       startSeconds <= stopSeconds,
       "Start seconds (" + std::to_string(startSeconds) +
@@ -869,30 +885,6 @@ void VideoDecoder::setCursorPtsInSeconds(double seconds) {
   desiredPtsSeconds_ = seconds;
 }
 
-bool VideoDecoder::canWeAvoidSeekingAudio(double desiredPtsSeconds) const {
-  const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
-  int64_t targetPts = *desiredPtsSeconds_ * streamInfo.timeBase.den;
-  int64_t lastDecodedAvFramePts = streamInfo.lastDecodedAvFramePts;
-
-  if (targetPts <= lastDecodedAvFramePts) {
-    return false;
-  }
-
-  // We can skip seeking if we want to decoder frame `i` and we just decoded
-  // frame `i - 1`. Note this involves a `log(numFrames)` complexity for each
-  // decoded frame.
-  // TODO we should bypass this log(numFrames) logic when calling range APIs
-  // where the step is 1, because we are sure in this case that all frames
-  // (except the first one) are consecutive. See a POC at
-  // https://github.com/pytorch/torchcodec/pull/514
-  double lastDecodedAvFramePtsSeconds =
-      ptsToSeconds(lastDecodedAvFramePts, streamInfo.timeBase);
-  int64_t lastDecodedAvFrameIndex =
-      secondsToIndexLowerBound(lastDecodedAvFramePtsSeconds);
-  int64_t targetFrameIndex = secondsToIndexLowerBound(desiredPtsSeconds);
-  return (lastDecodedAvFrameIndex + 1 == targetFrameIndex);
-}
-
 /*
 Videos have I frames and non-I frames (P and B frames). Non-I frames need data
 from the previous I frame to be decoded.
@@ -918,9 +910,13 @@ I    P     P    P    I    P    P    P    I    P    P    I    P    P    I    P
 
 (2) is more efficient than (1) if there is an I frame between x and y.
 */
-bool VideoDecoder::canWeAvoidSeekingVideo(int64_t targetPts) const {
-  int64_t lastDecodedAvFramePts =
-      streamInfos_.at(activeStreamIndex_).lastDecodedAvFramePts;
+bool VideoDecoder::canWeAvoidSeeking(int64_t targetPts) const {
+  const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
+    return true;
+  }
+
+  int64_t lastDecodedAvFramePts = streamInfo.lastDecodedAvFramePts;
   if (targetPts < lastDecodedAvFramePts) {
     // We can never skip a seek if we are seeking backwards.
     return false;
@@ -954,16 +950,7 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 
   decodeStats_.numSeeksAttempted++;
 
-  // TODO_CODE_QUALITY The different signature is unfortunate
-  bool canAvoidSeeking = false;
-  auto avMediaType = streamInfos_.at(activeStreamIndex_).avMediaType;
-  if (avMediaType == AVMEDIA_TYPE_AUDIO) {
-    canAvoidSeeking = canWeAvoidSeekingAudio(*desiredPtsSeconds_);
-  } else {
-    canAvoidSeeking = canWeAvoidSeekingVideo(desiredPts);
-  }
-
-  if (canAvoidSeeking) {
+  if (canWeAvoidSeeking(desiredPts)) {
     decodeStats_.numSeeksSkipped++;
     return;
   }
@@ -973,85 +960,13 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   // the key frame that we want to seek to.
   // See https://github.com/pytorch/torchcodec/issues/179 for more details.
   // See https://trac.ffmpeg.org/ticket/11137 for the underlying ffmpeg bug.
-  if (avMediaType == AVMEDIA_TYPE_VIDEO && !streamInfo.keyFrames.empty()) {
+  if (!streamInfo.keyFrames.empty()) {
     int desiredKeyFrameIndex = getKeyFrameIndexForPtsUsingScannedIndex(
         streamInfo.keyFrames, desiredPts);
     desiredKeyFrameIndex = std::max(desiredKeyFrameIndex, 0);
     desiredPts = streamInfo.keyFrames[desiredKeyFrameIndex].pts;
   }
 
-  if (avMediaType == AVMEDIA_TYPE_AUDIO) {
-    desiredPts -= 1;
-    // Note [Seek offset for audio]
-    //
-    // There is a strange FFmpeg behavior when decoding audio frames: seeking at
-    // a frame start and then flushing buffers with avcodec_flush_buffers (as is
-    // recommended by the FFmpeg docs) leads to the samples to be decoded
-    // incorrectly. It's difficult to really determine what's going on, but the
-    // fact is that there exist a data dependency between frames: for frame `i`
-    // to be correct, then the packet of frame `i-1` needs to be sent to the
-    // decoder, and there must be no flushing in-between. The naive (and
-    // incorrect) fix of just *not* flushing only works when we're decoding
-    // consecutive frames, but fails when decoding non-consecutive frames. We
-    // try to mitigate this issue via two different means:
-    // - A. We try to avoid seeking (and thus flushing) as much as possible.
-    //   Typically, we don't need to seek if we want frame `i` and we just
-    //   decoded frame `i - 1`: we just need to return the next frame. This
-    //   happens in the logic of `canWeAvoidSeekingAudio()`.
-    // - B. Instead of seeking to desiredPts, we seek to desiredPts - 1.
-    //   Effectively, this leads us to decode frame `i-1` before decoding frame
-    //   `i`. Our `filterFunction` logic in `decodeAVFrame()` ensures that we
-    //   are returning frame `i` (and not `i - 1`), and because we just decoded
-    //   frame `i-1`, frame `i` is correct.
-    //
-    // Strategy B works most of the time: in most decoding APIs, we first
-    // convert a frame's pts to an index, and then use that corresponding
-    // index's pts to decide where to seek. This means that `desiredPts` usually
-    // lands *exactly* where frame `i` starts, and `desiredPts - 1` is the last
-    // pts of frame `i-1`, so we do end up seeking (as desired) to frame `i-1`.
-    // But, there are cases where this offset trick won't work: if `desiredPts`
-    // isn't exactly at a frame's beginning. This corresponds to the following
-    // scenarios:
-    // - When calling any API in approximate mode *and* if the framerate isn't
-    //   constant. Because the framerate isn't constant, it's likely that the
-    //   index won't be correct, and that the index -> pts conversion won't land
-    //   exactly at a frame start either.
-    // - When calling `getFramePlayedAt(pts)`, regardless of the mode, if `pts`
-    //   doesn't land exactly at a frame's start. We have tests that currently
-    //   exhibit this behavior: test_get_frame_at_pts_audio_bad(). The "obvious"
-    //   fix for this is to let `getFramePlayedAt` convert the pts to an index,
-    //   just like the rest of the APIs.
-    //
-    // TODO HOW DO WE ADDRESS THIS??
-    //
-    // A few more notes:
-    // - This offset trick does work for the first frame at pts=0: we'll seek to
-    //   -1, and this leads to a first packet with pts=-1024 to be sent to the
-    //   decoder (on our test data), leading to frame 0 to be correctly decoded.
-    // - The data dependency / buffer flushing issue can be observed on
-    //   compressed formats like aac or mp3. It doesn't happen on uncompressed
-    //   formats like wav, where the decoder's buffers are likely unused. We
-    //   could skip this entire logic for such formats.
-    // - All this *seems* to be related to this 13yo+ issue:
-    //   https://stackoverflow.com/questions/7989623/ffmpeg-seeking-brings-audio-artifacts
-    //   But according to the thread, the problem there (which has been fixed)
-    //   seemed to be **lack** of flushing.
-    // - So far we have only observed a data-dependency of 1 frame: we need to
-    //   decode frame `i-1`  to decode `i`. It's possible that there exist
-    //   longer data dependencies of more than 1 frame on other videos /
-    //   formats. We just haven't observed those yet. If this happens to be the
-    //   case, then we have a much harder problem to solve.
-    // - This weird FFmpeg behavior is observable not just in Torchcodec, it
-    //   really seems to be an FFmpeg thing. Other decoders have the same
-    //   problem, like the ones in TorchVision. Those who do not exhibit this
-    //   behavior are solving it in inefficient ways: Decord effectively decodes
-    //   and caches the *entire* file when it is created, thus resolving the
-    //   data dependency. Similarly, TorchAudio effectively always decodes all
-    //   frames up to frame `i`, even after seeking to frame `i`, because it
-    //   sets the 'backwards' flag when it calls `av_seek_frame`: it actually
-    //   always seeks back to the beginning.
-  }
-
   int ffmepgStatus = avformat_seek_file(
       formatContext_.get(),
       streamInfo.streamIndex,
@@ -1130,6 +1045,7 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
       if (ffmpegStatus == AVERROR_EOF) {
         // End of file reached. We must drain the codec by sending a nullptr
         // packet.
+
         ffmpegStatus = avcodec_send_packet(
             streamInfo.codecContext.get(),
             /*avpkt=*/nullptr);
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -365,8 +365,7 @@ class VideoDecoder {
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------
 
-  bool canWeAvoidSeekingVideo(int64_t targetPts) const;
-  bool canWeAvoidSeekingAudio(double desiredPtsSeconds) const;
+  bool canWeAvoidSeeking(int64_t targetPts) const;
 
   void maybeSeekToBeforeDesiredPts();
 
@@ -487,6 +486,7 @@ class VideoDecoder {
   bool scannedAllStreams_ = false;
   // Tracks that we've already been initialized.
   bool initialized_ = false;
+  bool alreadyCalledGetFramesPlayedInRange_ = false;
 };
 
 // --------------------------------------------------------------------------
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -232,6 +232,13 @@ void add_audio_stream(
 }
 
 void seek_to_pts(at::Tensor& decoder, double seconds) {
+  // TODO: we should prevent more than one call to this op for audio streams,
+  // for the same reasons we do so for getFramesPlayedInRange(). But we can't
+  // implement the logic here, because we don't know the media type (audio vs
+  // video). We also can't do it within setCursorPtsInSeconds() because it's
+  // used by all other decoding methods.
+  // This isn't impossible, just not easy with the API we currently have.
+
   auto videoDecoder = static_cast<VideoDecoder*>(decoder.mutable_data_ptr());
   videoDecoder->setCursorPtsInSeconds(seconds);
 }
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -638,6 +638,94 @@ def test_audio_bad_method(self, method):
         ):
             method(decoder)
 
+    @pytest.mark.parametrize(
+        "start_seconds, stop_seconds",
+        (
+            # Beginning to end
+            (0, 13.05),
+            # At frame boundaries. Frame duration is exactly 0.064 seconds for
+            # NASA_AUDIO. Need an artificial -1e-5 on the upper bound to align
+            # the reference_frames with the frames returned by the decoder,
+            # where the interval is half-open.
+            (0.064 * 4, 0.064 * 20 - 1e-5),
+            # Not at frame boundaries
+            (2, 4),
+        ),
+    )
+    def test_audio_get_frames_by_pts_in_range(self, start_seconds, stop_seconds):
+        decoder = create_from_file(str(NASA_AUDIO.path))
+        add_audio_stream(decoder)
+
+        reference_frames = NASA_AUDIO.get_frame_data_by_range(
+            start=NASA_AUDIO.pts_to_frame_index(start_seconds),
+            stop=NASA_AUDIO.pts_to_frame_index(stop_seconds) + 1,
+        )
+        frames, _, _ = get_frames_by_pts_in_range(
+            decoder, start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+
+        assert_frames_equal(frames, reference_frames)
+
+    def test_audio_get_frames_by_pts_in_range_multiple_calls(self):
+        decoder = create_from_file(str(NASA_AUDIO.path))
+        add_audio_stream(decoder)
+
+        get_frames_by_pts_in_range(decoder, start_seconds=0, stop_seconds=1)
+        with pytest.raises(
+            RuntimeError, match="Can only decode once with audio stream"
+        ):
+            get_frames_by_pts_in_range(decoder, start_seconds=0, stop_seconds=1)
+
+    def test_audio_seek_and_next(self):
+        decoder = create_from_file(str(NASA_AUDIO.path))
+        add_audio_stream(decoder)
+
+        pts = 2
+        # Need +1 because we're not at frame boundaries
+        reference_frame = NASA_AUDIO.get_frame_data_by_index(
+            NASA_AUDIO.pts_to_frame_index(pts) + 1
+        )
+        seek_to_pts(decoder, pts)
+        frame, _, _ = get_next_frame(decoder)
+        assert_frames_equal(frame, reference_frame)
+
+        # Seeking forward is OK
+        pts = 4
+        reference_frame = NASA_AUDIO.get_frame_data_by_index(
+            NASA_AUDIO.pts_to_frame_index(pts) + 1
+        )
+        seek_to_pts(decoder, pts)
+        frame, _, _ = get_next_frame(decoder)
+        assert_frames_equal(frame, reference_frame)
+
+        # Seeking backwards doesn't error, but it's wrong. See TODO in
+        # `seek_to_pts` op.
+        prev_pts = pts
+        pts = 1
+        seek_to_pts(decoder, pts)
+        frame, _, _ = get_next_frame(decoder)
+        # the decoder actually didn't seek, so the frame we're getting is just
+        # the "next" one without seeking. This assertion exists to illustrate
+        # what currently happens, but it's obviously *wrong*.
+        reference_frame = NASA_AUDIO.get_frame_data_by_index(
+            NASA_AUDIO.pts_to_frame_index(prev_pts) + 2
+        )
+        assert_frames_equal(frame, reference_frame)
+
+    # def test_audio_seek_and_next_backwards(self):
+    #     decoder = create_from_file(str(NASA_AUDIO.path))
+    #     add_audio_stream(decoder)
+
+    #     for pts in (4.5, 2):
+    #         # Need +1 because we're not at frames boundaries
+    #         reference_frame = NASA_AUDIO.get_frame_data_by_index(NASA_AUDIO.pts_to_frame_index(pts) + 1)
+    #         seek_to_pts(decoder, pts)
+    #         frame, _, _ = get_next_frame(decoder)
+    #         # assert_frames_equal(frame, reference_frame)
+
+    #     reference_frame = NASA_AUDIO.get_frame_data_by_index(NASA_AUDIO.pts_to_frame_index(4.5) + 2)
+    #     assert_frames_equal(frame, reference_frame)
+
 
 if __name__ == "__main__":
     pytest.main()
diff --git a/test/utils.py b/test/utils.py
@@ -356,6 +356,13 @@ def get_frame_data_by_index(
 
         return self._reference_frames[idx]
 
+    def pts_to_frame_index(self, pts_seconds: float) -> int:
+        # These are hard-coded values assuming stream 4 of nasa_13013.mp4. Each
+        # of the 204 frames contains 1024 samples.
+        # TODO make this more generic
+        frame_duration_seconds = 0.064
+        return int(pts_seconds // frame_duration_seconds)
+
     # TODO: this shouldn't be named chw. Also values are hard-coded
     @property
     def empty_chw_tensor(self) -> torch.Tensor:

Original file line number	Diff line number	Diff line change
`@@ -232,6 +232,13 @@ void add_audio_stream(`
`232`	`232`	`}`
`233`	`233`
`234`	`234`	`void seek_to_pts(at::Tensor& decoder, double seconds) {`
	`235`	`+ // TODO we should prevent more than one call to this op for audio streams, for`
	`236`	`+ // the same reasons we do so for getFramesPlayedInRange(). But we can't`
	`237`	`+ // implement the logic here, because we don't know media type (audio vs`
	`238`	`+ // video). We also can't do it within setCursorPtsInSeconds because it's used`
	`239`	`+ // by all other decoding methods.`
	`240`	`+ // This isn't un-doable, just not easy with the API we currently have.`
	`241`	`+`
`235`	`242`	`auto videoDecoder = static_cast<VideoDecoder*>(decoder.mutable_data_ptr());`
`236`	`243`	`videoDecoder->setCursorPtsInSeconds(seconds);`
`237`	`244`	`}`