More validation, more tests

NicolasHug · NicolasHug · commit ce12f03d1ffb · 2025-03-08T13:07:55.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -842,16 +842,24 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
 torch::Tensor VideoDecoder::getFramesPlayedInRangeAudio(
     double startSeconds,
     double stopSeconds) {
+  TORCH_CHECK(
+      startSeconds <= stopSeconds,
+      "Start seconds (" + std::to_string(startSeconds) +
+          ") must be less than or equal to stop seconds (" +
+          std::to_string(stopSeconds) + ").");
+
   validateActiveStream(AVMEDIA_TYPE_AUDIO);
 
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
-  double frameStartTime =
-      ptsToSeconds(streamInfo.lastDecodedAvFramePts, streamInfo.timeBase);
-  double frameEndTime = ptsToSeconds(
-      streamInfo.lastDecodedAvFramePts + streamInfo.lastDecodedAvFrameDuration,
-      streamInfo.timeBase);
 
-  TORCH_CHECK(startSeconds > frameEndTime, "OSKOOOOOUUUUUUURRRRRR");
+  auto lastDecodedFrameIsPlayedAtStopSeconds =
+      [this, &streamInfo, stopSeconds]() {
+        auto stopPts = secondsToClosestPts(stopSeconds, streamInfo.timeBase);
+        return (
+            streamInfo.lastDecodedAvFramePts <= stopPts and
+            stopPts <= streamInfo.lastDecodedAvFramePts +
+                    streamInfo.lastDecodedAvFrameDuration);
+      };
 
   setCursorPtsInSeconds(startSeconds);
 
@@ -860,26 +868,19 @@ torch::Tensor VideoDecoder::getFramesPlayedInRangeAudio(
   // sample rate, so in theory we know the number of output samples.
   std::vector<torch::Tensor> tensors;
 
-  while (true) {
-    AVFrameStream avFrameStream = decodeAVFrame([this](AVFrame* avFrame) {
-      StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
-      return (avFrame->pts >= activeStreamInfo.discardFramesBeforePts) ||
-          (avFrame->pts < activeStreamInfo.discardFramesBeforePts &&
-           activeStreamInfo.discardFramesBeforePts <
-               avFrame->pts + avFrame->duration);
-    });
-    auto frameOutput = convertAVFrameToFrameOutput(avFrameStream);
-    tensors.push_back(frameOutput.data);
-
-    double lastFrameStartPts =
-        ptsToSeconds(streamInfo.lastDecodedAvFramePts, streamInfo.timeBase);
-    double lastFrameEndPts = ptsToSeconds(
-        streamInfo.lastDecodedAvFramePts +
-            streamInfo.lastDecodedAvFrameDuration,
-        streamInfo.timeBase);
-
-    if (lastFrameStartPts <= stopSeconds and stopSeconds <= lastFrameEndPts) {
-      break;
+  bool reachedEOF = false;
+  while (!lastDecodedFrameIsPlayedAtStopSeconds() && !reachedEOF) {
+    try {
+      AVFrameStream avFrameStream =
+          decodeAVFrame([&streamInfo](AVFrame* avFrame) {
+            return (
+                streamInfo.discardFramesBeforePts <
+                avFrame->pts + getDuration(avFrame));
+          });
+      auto frameOutput = convertAVFrameToFrameOutput(avFrameStream);
+      tensors.push_back(frameOutput.data);
+    } catch (const EndOfFileException& e) {
+      reachedEOF = true;
     }
   }
   return torch::cat(tensors, 1);
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -339,7 +339,7 @@ class VideoDecoder {
     // The current position of the cursor in the stream, and associated frame
     // duration.
     int64_t lastDecodedAvFramePts = 0;
-    int64_t lastDecodedAvFrameDuration = -1;
+    int64_t lastDecodedAvFrameDuration = 0;
     // The desired position of the cursor in the stream. We send frames >=
     // this pts to the user when they request a frame.
     // We update this field if the user requested a seek. This typically
diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py
@@ -664,12 +664,20 @@ def test_audio_decode_all_samples_with_next(self, asset):
         assert_frames_equal(all_frames, reference_frames)
 
     @pytest.mark.parametrize(
-        "range", ("begin_to_end", "at_frame_boundaries", "not_at_frame_boundaries")
+        "range",
+        (
+            "begin_to_end",
+            "begin_to_beyond_end",
+            "at_frame_boundaries",
+            "not_at_frame_boundaries",
+        ),
     )
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_get_frames_by_pts_in_range_audio(self, range, asset):
         if range == "begin_to_end":
             start_seconds, stop_seconds = 0, asset.duration_seconds
+        elif range == "begin_to_beyond_end":
+            start_seconds, stop_seconds = 0, asset.duration_seconds + 10
         elif range == "at_frame_boundaries":
             start_seconds = asset.frames[asset.default_stream_index][10].pts_seconds
             stop_seconds = asset.frames[asset.default_stream_index][40].pts_seconds
@@ -687,6 +695,9 @@ def test_get_frames_by_pts_in_range_audio(self, range, asset):
         decoder = create_from_file(str(asset.path), seek_mode="approximate")
         add_audio_stream(decoder)
 
+        # stop_offset logic: if stop_seconds falls exactly on a frame boundary
+        # (i.e. the pts at which a frame starts), that frame should *not* be in
+        # the output. Otherwise it is part of it, which is why we add 1 to `stop=`.
         stop_offset = 0 if range == "at_frame_boundaries" else 1
         reference_frames = asset.get_frame_data_by_range(
             start=asset.get_frame_index(pts_seconds=start_seconds),
@@ -711,6 +722,20 @@ def test_decode_epsilon_range(self, asset, expected_shape):
         )
         assert frames.shape == expected_shape
 
+    @pytest.mark.parametrize(
+        "asset, expected_shape", ((NASA_AUDIO, (2, 1024)), (NASA_AUDIO_MP3, (2, 576)))
+    )
+    def test_decode_just_one_frame_at_boundaries(self, asset, expected_shape):
+        # Requested range: from the pts of frame 10 up to the pts of frame 11.
+        stream_frames = asset.frames[asset.default_stream_index]
+        lo, hi = stream_frames[10], stream_frames[11]
+        audio_decoder = create_from_file(str(asset.path), seek_mode="approximate")
+        add_audio_stream(audio_decoder)
+        decoded = get_frames_by_pts_in_range_audio(
+            audio_decoder, start_seconds=lo.pts_seconds, stop_seconds=hi.pts_seconds
+        )
+        assert decoded.shape == expected_shape
+
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_seek_and_next_audio(self, asset):
         decoder = create_from_file(str(asset.path), seek_mode="approximate")