Merge branch 'main' of github.com:pytorch/torchcodec into fltp

NicolasHug · NicolasHug · commit 3da223c7b139 · 2025-03-17T13:25:05.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -468,6 +468,7 @@ void VideoDecoder::addStream(
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
   streamInfo.codecContext->thread_count = ffmpegThreadCount.value_or(0);
+  streamInfo.codecContext->pkt_timebase = streamInfo.stream->time_base;
 
   // TODO_CODE_QUALITY same as above.
   if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
@@ -573,13 +574,15 @@ void VideoDecoder::addAudioStream(int streamIndex) {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
-  output.data = maybePermuteHWC2CHW(output.data);
+  if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
+    output.data = maybePermuteHWC2CHW(output.data);
+  }
   return output;
 }
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
   AVFrameStream avFrameStream = decodeAVFrame(
       [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
@@ -875,7 +878,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
     // If we need to seek backwards, then we have to seek back to the beginning
     // of the stream.
     // TODO-AUDIO: document why this is needed in a big comment.
-    setCursorPtsInSeconds(INT64_MIN);
+    setCursorPtsInSecondsInternal(INT64_MIN);
   }
 
   // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
@@ -921,6 +924,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
 // --------------------------------------------------------------------------
 
 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  setCursorPtsInSecondsInternal(seconds);
+}
+
+void VideoDecoder::setCursorPtsInSecondsInternal(double seconds) {
   cursorWasJustSet_ = true;
   cursor_ =
       secondsToClosestPts(seconds, streamInfos_[activeStreamIndex_].timeBase);
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -371,6 +371,7 @@ class VideoDecoder {
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------
 
+  void setCursorPtsInSecondsInternal(double seconds);
   bool canWeAvoidSeeking() const;
 
   void maybeSeekToBeforeDesiredPts();
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -990,13 +990,7 @@ def test_get_all_samples(self, asset, stop_seconds):
         torch.testing.assert_close(samples.data, reference_frames)
         assert samples.sample_rate == asset.sample_rate
 
-        # TODO there's a bug with NASA_AUDIO_MP3: https://github.com/pytorch/torchcodec/issues/553
-        expected_pts = (
-            0.072
-            if asset is NASA_AUDIO_MP3
-            else asset.get_frame_info(idx=0).pts_seconds
-        )
-        assert samples.pts_seconds == expected_pts
+        assert samples.pts_seconds == asset.get_frame_info(idx=0).pts_seconds
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_at_frame_boundaries(self, asset):
@@ -1060,12 +1054,8 @@ def test_start_equals_stop(self, asset):
         assert samples.data.shape == (0, 0)
 
     def test_frame_start_is_not_zero(self):
-        # For NASA_AUDIO_MP3, the first frame is not at 0, it's at 0.072 [1].
+        # For NASA_AUDIO_MP3, the first frame is not at 0, it's at 0.138125.
         # So if we request start = 0.05, we shouldn't be truncating anything.
-        #
-        # [1] well, really it's at 0.138125, not 0.072 (see
-        # https://github.com/pytorch/torchcodec/issues/553), but for the purpose
-        # of this test it doesn't matter.
 
         asset = NASA_AUDIO_MP3
         start_seconds = 0.05  # this is less than the first frame's pts
diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py
@@ -626,7 +626,7 @@ class TestAudioOps:
             partial(get_frames_in_range, start=4, stop=5),
             partial(get_frame_at_pts, seconds=2),
             partial(get_frames_by_pts, timestamps=[0, 1.5]),
-            partial(get_next_frame),
+            partial(seek_to_pts, seconds=5),
         ),
     )
     def test_audio_bad_method(self, method):
@@ -642,6 +642,22 @@ def test_audio_bad_seek_mode(self):
         ):
             add_audio_stream(decoder)
 
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    def test_next(self, asset):
+        decoder = create_from_file(str(asset.path), seek_mode="approximate")
+        add_audio_stream(decoder)
+
+        frame_index = 0
+        while True:
+            try:
+                frame, *_ = get_next_frame(decoder)
+            except IndexError:
+                break
+            torch.testing.assert_close(
+                frame, asset.get_frame_data_by_index(frame_index)
+            )
+            frame_index += 1
+
     @pytest.mark.parametrize(
         "range",
         (
@@ -826,6 +842,8 @@ def get_reference_frames(start_seconds, stop_seconds):
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_pts(self, asset):
+        # Non-regression test for
+        # https://github.com/pytorch/torchcodec/issues/553
         decoder = create_from_file(str(asset.path), seek_mode="approximate")
         add_audio_stream(decoder)
 
@@ -840,15 +858,24 @@ def test_pts(self, asset):
                 frames, asset.get_frame_data_by_index(frame_index)
             )
 
-            if asset is NASA_AUDIO_MP3 and frame_index == 0:
-                # TODO This is a bug. The 0.138125 is correct while 0.072 is
-                # incorrect, even though it comes from the decoded AVFrame's pts
-                # field.
-                # See https://github.com/pytorch/torchcodec/issues/553
-                assert pts_seconds == 0.072
-                assert start_seconds == 0.138125
-            else:
-                assert pts_seconds == start_seconds
+            assert pts_seconds == start_seconds
+
+    def test_decode_before_frame_start(self):
+        # Test illustrating bug described in
+        # https://github.com/pytorch/torchcodec/issues/567
+        asset = NASA_AUDIO_MP3
+
+        decoder = create_from_file(str(asset.path), seek_mode="approximate")
+        add_audio_stream(decoder)
+
+        frames, *_ = get_frames_by_pts_in_range_audio(
+            decoder, start_seconds=0, stop_seconds=0.05
+        )
+        all_frames, *_ = get_frames_by_pts_in_range_audio(
+            decoder, start_seconds=0, stop_seconds=None
+        )
+        # TODO fix this. `frames` should be empty: stop=0.05 precedes the first frame.
+        torch.testing.assert_close(frames, all_frames)
 
 
 if __name__ == "__main__":