Handle approximate mode. Sort of.

NicolasHug · NicolasHug · commit 0f92d60c7ccb · 2025-02-16T15:02:04.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -156,6 +156,9 @@ void VideoDecoder::initializeDecoder() {
         "Our stream index, " + std::to_string(i) +
             ", does not match AVStream's index, " +
             std::to_string(avStream->index) + ".");
+
+    // TODO figure out audio metadata
+
     streamMetadata.streamIndex = i;
     streamMetadata.mediaType = avStream->codecpar->codec_type;
     streamMetadata.codecName = avcodec_get_name(avStream->codecpar->codec_id);
@@ -171,12 +174,22 @@ void VideoDecoder::initializeDecoder() {
           av_q2d(avStream->time_base) * avStream->duration;
     }
 
-    double fps = av_q2d(avStream->r_frame_rate);
-    if (fps > 0) {
-      streamMetadata.averageFps = fps;
-    }
-
     if (avStream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
+      double fps = av_q2d(avStream->r_frame_rate);
+      if (fps > 0) {
+        streamMetadata.averageFps = fps;
+      }
+    } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
+      int numSamplesPerFrame = avStream->codecpar->frame_size;
+      int sampleRate = avStream->codecpar->sample_rate;
+      if (numSamplesPerFrame > 0 && sampleRate > 0) {
+        // This should allow the approximate mode to do its magic.
+        // fps is numFrames / duration where
+        // - duration = numSamplesTotal / sampleRate and
+        // - numSamplesTotal = numSamplesPerFrame * numFrames
+        streamMetadata.averageFps =
+            static_cast<double>(sampleRate) / numSamplesPerFrame;
+      }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
       containerMetadata_.numAudioStreams++;
@@ -465,15 +478,27 @@ void VideoDecoder::addStream(
             .value_or(avCodec));
   }
 
-  // TODO figure out audio metadata
+  // TODO: For audio, we raise if seek_mode="approximate" and if the number of
+  // samples per frame is unknown (frame_size field of codec params). But that's
+  // quite limiting. Ultimately, the most common type of call will be to decode
+  // an entire file from start to end (possibly with some offsets for start and
+  // end). And for that, we shouldn't [need to] force the user to scan, because
+  // all this entails is a single call to seek(start) (if at all) and then just
+  // a bunch of consecutive calls to getNextFrame(). Maybe there should be a
+  // third seek mode for audio, e.g. seek_mode="contiguous" where we don't scan,
+  // and only allow calls to getFramesPlayedAt().
   StreamMetadata& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
   if (seekMode_ == SeekMode::approximate &&
       !streamMetadata.averageFps.has_value()) {
-    throw std::runtime_error(
-        "Seek mode is approximate, but stream " +
-        std::to_string(activeStreamIndex_) +
-        " does not have an average fps in its metadata.");
+    std::string errMsg = "Seek mode is approximate, but stream " +
+        std::to_string(activeStreamIndex_) + "does not have ";
+    if (mediaType == AVMEDIA_TYPE_VIDEO) {
+      errMsg += "an average fps in its metadata.";
+    } else {
+      errMsg += "a constant number of samples per frame.";
+    }
+    throw std::runtime_error(errMsg);
   }
 
   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -109,10 +109,13 @@ def test_add_stream(self):
         ),
     )
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_seek_and_next(self, test_ref, index_of_frame_after_seeking_at_6, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_seek_and_next(
+        self, test_ref, index_of_frame_after_seeking_at_6, device, seek_mode
+    ):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
         frame0, _, _ = get_next_frame(decoder)
         reference_frame0 = test_ref.get_frame_data_by_index(0)
@@ -129,11 +132,12 @@ def test_seek_and_next(self, test_ref, index_of_frame_after_seeking_at_6, device
 
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_seek_to_negative_pts(self, test_ref, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_seek_to_negative_pts(self, test_ref, device, seek_mode):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
 
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
         frame0, _, _ = get_next_frame(decoder)
         reference_frame0 = test_ref.get_frame_data_by_index(0)
@@ -144,9 +148,10 @@ def test_seek_to_negative_pts(self, test_ref, device):
         assert_frames_equal(frame0, reference_frame0.to(device))
 
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frame_at_pts_video(self, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frame_at_pts_video(self, device, seek_mode):
 
-        decoder = create_from_file(str(NASA_VIDEO.path))
+        decoder = create_from_file(str(NASA_VIDEO.path), seek_mode=seek_mode)
         add_video_stream(decoder=decoder, device=device)
         # This frame has pts=6.006 and duration=0.033367, so it should be visible
         # at timestamps in the range [6.006, 6.039367) (not including the last timestamp).
@@ -168,8 +173,9 @@ def test_get_frame_at_pts_video(self, device):
             with pytest.raises(AssertionError):
                 assert_frames_equal(next_frame, reference_frame6.to(device))
 
-    def test_get_frame_at_pts_audio(self):
-        decoder = create_from_file(str(NASA_AUDIO.path))
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frame_at_pts_audio(self, seek_mode):
+        decoder = create_from_file(str(NASA_AUDIO.path), seek_mode=seek_mode)
         add_audio_stream(decoder=decoder)
         # This frame has pts=6.016 and duration=0.064 , so it should be played
         # at timestamps in the range [6.016, 6.08) (not including the last timestamp).
@@ -191,11 +197,12 @@ def test_get_frame_at_pts_audio(self):
 
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frame_at_index(self, test_ref, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frame_at_index(self, test_ref, device, seek_mode):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
 
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
         frame0, _, _ = get_frame_at_index(decoder, frame_index=0)
         reference_frame0 = test_ref.get_frame_data_by_index(0)
@@ -213,12 +220,13 @@ def test_get_frame_at_index(self, test_ref, device):
         ),
     )
     @pytest.mark.parametrize("device", cpu_and_cuda())
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_get_frame_with_info_at_index(
-        self, test_ref, expected_pts, expected_duration, device
+        self, test_ref, expected_pts, expected_duration, device, seek_mode
     ):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
         frame6, pts, duration = get_frame_at_index(decoder, frame_index=180)
         reference_frame6 = test_ref.get_frame_data_by_index(180)
@@ -228,10 +236,11 @@ def test_get_frame_with_info_at_index(
 
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frames_at_indices(self, test_ref, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frames_at_indices(self, test_ref, device, seek_mode):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
         frames0and180, *_ = get_frames_at_indices(decoder, frame_indices=[0, 180])
         reference_frame0 = test_ref.get_frame_data_by_index(0)
@@ -242,11 +251,12 @@ def test_get_frames_at_indices(self, test_ref, device):
 
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frames_at_indices_unsorted_indices(self, test_ref, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frames_at_indices_unsorted_indices(self, test_ref, device, seek_mode):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
 
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
 
         frame_indices = [2, 0, 1, 0, 2]
@@ -272,8 +282,9 @@ def test_get_frames_at_indices_unsorted_indices(self, test_ref, device):
             assert_frames_equal(frames[0], frames[-1])
 
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frames_by_pts(self, device):
-        decoder = create_from_file(str(NASA_VIDEO.path))
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frames_by_pts(self, device, seek_mode):
+        decoder = create_from_file(str(NASA_VIDEO.path), seek_mode=seek_mode)
         _add_video_stream(decoder=decoder, device=device)
 
         # Note: 13.01 should give the last video frame for the NASA video
@@ -361,10 +372,11 @@ def test_pts_apis_against_index_ref(self, test_ref, device):
 
     @pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
     @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_get_frames_in_range(self, test_ref, device):
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    def test_get_frames_in_range(self, test_ref, device, seek_mode):
         if device == "cuda" and test_ref is NASA_AUDIO:
             pytest.skip(reason="CUDA decoding not supported for audio")
-        decoder = create_from_file(str(test_ref.path))
+        decoder = create_from_file(str(test_ref.path), seek_mode=seek_mode)
         _add_stream(decoder=decoder, test_ref=test_ref, device=device)
 
         # ensure that the degenerate case of a range of size 1 works