tons of comments

NicolasHug · NicolasHug · commit 6c7e31f36569 · 2025-02-17T15:22:31.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -877,6 +877,13 @@ bool VideoDecoder::canWeAvoidSeekingAudio(double desiredPtsSeconds) const {
     return false;
   }
 
+  // We can skip seeking if we want to decode frame `i` and we just decoded
+  // frame `i - 1`. Note this involves a `log(numFrames)` complexity for each
+  // decoded frame.
+  // TODO we should bypass this log(numFrames) logic when calling range APIs
+  // where the step is 1, because we are sure in this case that all frames
+  // (except the first one) are consecutive. See a POC at
+  // https://github.com/pytorch/torchcodec/pull/514
   double lastDecodedAvFramePtsSeconds =
       ptsToSeconds(lastDecodedAvFramePts, streamInfo.timeBase);
   int64_t lastDecodedAvFrameIndex =
@@ -972,17 +979,81 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
     desiredPts = streamInfo.keyFrames[desiredKeyFrameIndex].pts;
   }
 
-  // TODO explain this nasty hack
-  // This probably only works if the desired pts corresponds exactly to a frame
-  // start.
-  int64_t offset = avMediaType == AVMEDIA_TYPE_VIDEO ? 0 : -1;
+  if (avMediaType == AVMEDIA_TYPE_AUDIO) {
+    desiredPts -= 1;
+    // Note [Seek offset for audio]
+    //
+    // There is a strange FFmpeg behavior when decoding audio frames: seeking at
+    // a frame start and then flushing buffers with avcodec_flush_buffers (as is
+    // recommended by the FFmpeg docs) leads to the samples being decoded
+    // incorrectly. It's difficult to really determine what's going on, but the
+    // fact is that there exists a data dependency between frames: for frame `i`
+    // to be correct, then the packet of frame `i-1` needs to be sent to the
+    // decoder, and there must be no flushing in-between. The naive (and
+    // incorrect) fix of just *not* flushing only works when we're decoding
+    // consecutive frames, but fails when decoding non-consecutive frames. We
+    // try to mitigate this issue via two different means:
+    // - A. We try to avoid seeking (and thus flushing) as much as possible.
+    //   Typically, we don't need to seek if we want frame `i` and we just
+    //   decoded frame `i - 1`: we just need to return the next frame. This
+    //   happens in the logic of `canWeAvoidSeekingAudio()`.
+    // - B. Instead of seeking to desiredPts, we seek to desiredPts - 1.
+    //   Effectively, this leads us to decode frame `i-1` before decoding frame
+    //   `i`. Our `filterFunction` logic in `decodeAVFrame()` ensures that we
+    //   are returning frame `i` (and not `i - 1`), and because we just decoded
+    //   frame `i-1`, frame `i` is correct.
+    //
+    // Strategy B works most of the time: in most decoding APIs, we first
+    // convert a frame's pts to an index, and then use that corresponding
+    // index's pts to decide where to seek. This means that `desiredPts` usually
+    // lands *exactly* where frame `i` starts, and `desiredPts - 1` is the last
+    // pts of frame `i-1`, so we do end up seeking (as desired) to frame `i-1`.
+    // But, there are cases where this offset trick won't work: if `desiredPts`
+    // isn't exactly at a frame's beginning. This corresponds to the following
+    // scenarios:
+    // - When calling any API in approximate mode *and* if the framerate isn't
+    //   constant. Because the framerate isn't constant, it's likely that the
+    //   index won't be correct, and that the index -> pts conversion won't land
+    //   exactly at a frame start either.
+    // - When calling `getFramePlayedAt(pts)`, regardless of the mode, if `pts`
+    //   doesn't land exactly at a frame's start. We have tests that currently
+    //   exhibit this behavior: test_get_frame_at_pts_audio_bad().
+    // TODO HOW DO WE FIX THIS??
+
+    // A few notes:
+    // - This offset trick does work for the first frame at pts=0: we'll seek to
+    //   -1, which causes a first packet with pts=-1024 to be sent to the
+    //   decoder (on our test data), so frame 0 is correctly decoded.
+    // - The data dependency / buffer flushing issue can be observed on
+    //   compressed formats like aac or mp3. It doesn't happen on uncompressed
+    //   formats like wav, where the decoder's buffers are likely unused. We
+    //   could skip this entire logic for such formats.
+    // - All this *seems* to be related to this 13yo+ issue:
+    //   https://stackoverflow.com/questions/7989623/ffmpeg-seeking-brings-audio-artifacts
+    //   But according to the thread, the problem there (which has been fixed)
+    //   seemed to be **lack** of flushing.
+    // - So far we have only observed a data-dependency of 1 frame: we need to
+    //   decode frame `i-1` to decode `i`. It's possible that there exist
+    //   longer data dependencies of more than 1 frame on other videos /
+    //   formats. We just haven't observed those yet. If this happens to be the
+    //   case, then we have a much harder problem to solve.
+    // - This weird FFmpeg behavior is observable not just in Torchcodec, it
+    //   really seems to be an FFmpeg thing. Other decoders have the same
+    //   problem, like the ones in TorchVision. Those that do not exhibit this
+    //   behavior are solving it in inefficient ways: Decord effectively decodes
+    //   and caches the *entire* file when it is created, thus resolving the
+    //   data dependency. Similarly, TorchAudio effectively always decodes all
+    //   frames up to frame `i`, even after seeking to frame `i`, because it
+    //   sets the 'backwards' flag when it calls `av_seek_frame`: it actually
+    //   always seeks back to the beginning.
+  }
 
   int ffmepgStatus = avformat_seek_file(
       formatContext_.get(),
       streamInfo.streamIndex,
       INT64_MIN,
-      desiredPts + offset,
-      desiredPts + offset,
+      desiredPts,
+      desiredPts,
       0);
 
   if (ffmepgStatus < 0) {
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -195,6 +195,29 @@ def test_get_frame_at_pts_audio(self, seek_mode):
         with pytest.raises(AssertionError):
             assert_frames_equal(next_frame, reference_frame6)
 
+    def test_get_frame_at_pts_audio_bad(self):
+        decoder = create_from_file(str(NASA_AUDIO.path))
+        add_audio_stream(decoder=decoder)
+
+        reference_frame6 = NASA_AUDIO.get_frame_data_by_index(
+            INDEX_OF_AUDIO_FRAME_AFTER_SEEKING_AT_6
+        )
+        frame6, _, _ = get_frame_at_pts(decoder, 6.05)
+        # See Note [Seek offset for audio].
+        # The frame played at 6.05 should be the reference frame, but because
+        # 6.05 isn't exactly the beginning of that frame, the samples are
+        # decoded incorrectly.
+        # TODO Fix this.
+        with pytest.raises(AssertionError):
+            assert_frames_equal(frame6, reference_frame6)
+
+        # And yet another quirk: if we try to decode it again, we actually end
+        # up with the samples being correctly decoded. This is because we have a
+        # custom logic within getFramePlayedAt() that resets desiredPts to the
+        # pts of the beginning of the frame in some very specific cases.
+        frame6, _, _ = get_frame_at_pts(decoder, 6.05)
+        assert_frames_equal(frame6, reference_frame6)
+
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
@@ -779,34 +802,48 @@ def test_cuda_decoder(self):
         )
 
     def test_get_same_frame_twice(self):
+        # Non-regression tests that were useful while developing audio support.
         def make_decoder():
             decoder = create_from_file(str(NASA_AUDIO.path))
             add_audio_stream(decoder)
             return decoder
 
         for frame_index in (0, 10, 15):
+            ref = NASA_AUDIO.get_frame_data_by_index(frame_index)
+
             decoder = make_decoder()
             a = get_frame_at_index(decoder, frame_index=frame_index)
             b = get_frame_at_index(decoder, frame_index=frame_index)
             torch.testing.assert_close(a, b)
+            torch.testing.assert_close(a[0], ref)
 
             decoder = make_decoder()
             a = get_frames_at_indices(decoder, frame_indices=[frame_index])
             b = get_frames_at_indices(decoder, frame_indices=[frame_index])
             torch.testing.assert_close(a, b)
+            torch.testing.assert_close(a[0][0], ref)
 
             decoder = make_decoder()
             a = get_frames_in_range(decoder, start=frame_index, stop=frame_index + 1)
             b = get_frames_in_range(decoder, start=frame_index, stop=frame_index + 1)
             torch.testing.assert_close(a, b)
+            torch.testing.assert_close(a[0][0], ref)
 
-        pts_at_frame_start = 0
+        pts_at_frame_start = 0  # 0 corresponds exactly to a frame start
+        index_of_frame_at_0 = 0
         pts_not_at_frame_start = 2  # second 2 is in the middle of a frame
-        for pts in (pts_at_frame_start, pts_not_at_frame_start):
+        index_of_frame_at_2 = 31
+        for pts, frame_index in (
+            (pts_at_frame_start, index_of_frame_at_0),
+            (pts_not_at_frame_start, index_of_frame_at_2),
+        ):
+            ref = NASA_AUDIO.get_frame_data_by_index(frame_index)
+
             decoder = make_decoder()
             a = get_frames_by_pts(decoder, timestamps=[pts])
             b = get_frames_by_pts(decoder, timestamps=[pts])
             torch.testing.assert_close(a, b)
+            torch.testing.assert_close(a[0][0], ref)
 
             decoder = make_decoder()
             a = get_frames_by_pts_in_range(
@@ -816,11 +853,15 @@ def make_decoder():
                 decoder, start_seconds=pts, stop_seconds=pts + 1e-4
             )
             torch.testing.assert_close(a, b)
+            torch.testing.assert_close(a[0][0], ref)
 
         decoder = make_decoder()
         a = get_frame_at_pts(decoder, seconds=pts_at_frame_start)
         b = get_frame_at_pts(decoder, seconds=pts_at_frame_start)
         torch.testing.assert_close(a, b)
+        torch.testing.assert_close(
+            a[0], NASA_AUDIO.get_frame_data_by_index(index_of_frame_at_0)
+        )
 
         decoder = make_decoder()
         a_frame, a_pts, a_duration = get_frame_at_pts(
@@ -831,8 +872,17 @@ def make_decoder():
         )
         torch.testing.assert_close(a_pts, b_pts)
         torch.testing.assert_close(a_duration, b_duration)
+        # TODO fix this. These checks should pass
         with pytest.raises(AssertionError):
             torch.testing.assert_close(a_frame, b_frame)
+        with pytest.raises(AssertionError):
+            torch.testing.assert_close(
+                a_frame, NASA_AUDIO.get_frame_data_by_index(index_of_frame_at_2)
+            )
+        # But the second time works ¯\_(ツ)_/¯ (see also test_get_frame_at_pts_audio_bad())
+        torch.testing.assert_close(
+            b_frame, NASA_AUDIO.get_frame_data_by_index(index_of_frame_at_2)
+        )
 
         decoder = make_decoder()
         seek_to_pts(decoder, pts_at_frame_start)
@@ -841,13 +891,15 @@ def make_decoder():
         b = get_next_frame(decoder)
         torch.testing.assert_close(a, b)
 
-        # TODO: Wait WTFFF, this should not pass
         decoder = make_decoder()
         seek_to_pts(decoder, seconds=pts_not_at_frame_start)
         a = get_next_frame(decoder)
         seek_to_pts(decoder, seconds=pts_not_at_frame_start)
         b = get_next_frame(decoder)
         torch.testing.assert_close(a, b)
+        torch.testing.assert_close(
+            a[0], NASA_AUDIO.get_frame_data_by_index(index_of_frame_at_2 + 1)
+        )
 
 
 if __name__ == "__main__":