WIP

NicolasHug · NicolasHug · commit a7b67d54adde · 2025-03-13T11:55:21.000Z
diff --git a/src/torchcodec/_frame.py b/src/torchcodec/_frame.py
@@ -115,13 +115,16 @@ def __len__(self):
     def __repr__(self):
         return _frame_repr(self)
 
+
 @dataclass
 class AudioSamples(Iterable):
     """Audio samples with associated metadata."""
+
     # TODO-AUDIO: docs
     data: Tensor
     pts_seconds: float
     sample_rate: int
+
     def __post_init__(self):
         # This is called after __init__() when a Frame is created. We can run
         # input validation checks here.
@@ -135,4 +138,4 @@ def __iter__(self) -> Iterator[Union[Tensor, float]]:
             yield getattr(self, field.name)
 
     def __repr__(self):
-        return _frame_repr(self)
+        return _frame_repr(self)
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -82,6 +82,9 @@ def get_samples_played_in_range(
             offset_beginning = round((start_seconds - first_pts) * sample_rate)
             output_pts_seconds = start_seconds
         else:
+            # In normal cases we'll have first_pts <= start_seconds, but in some
+            # edge cases it's possible to have first_pts > start_seconds,
+            # typically if the stream's first frame's pts isn't exactly 0.
             offset_beginning = 0
             output_pts_seconds = first_pts
 
@@ -97,4 +100,3 @@ def get_samples_played_in_range(
             pts_seconds=output_pts_seconds,
             sample_rate=sample_rate,
         )
-
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -854,7 +854,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
 
   if (startSeconds == stopSeconds) {
     // For consistency with video
-    return AudioFramesOutput{torch::empty({0}), 0.0};
+    return AudioFramesOutput{torch::empty({0, 0}), 0.0};
   }
 
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -957,33 +957,124 @@ def test_metadata(self, asset):
         assert decoder.metadata.num_channels == asset.num_channels
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
-    def test_get_all_samples(self, asset):
+    def test_error(self, asset):
         decoder = AudioDecoder(asset.path)
-        
-        samples = decoder.get_samples_played_in_range(start_seconds=0, stop_seconds=None)
+
+        with pytest.raises(ValueError, match="Invalid start seconds"):
+            decoder.get_samples_played_in_range(start_seconds=-1300)
+
+        with pytest.raises(ValueError, match="Invalid start seconds"):
+            decoder.get_samples_played_in_range(start_seconds=9999)
+
+        with pytest.raises(ValueError, match="Invalid start seconds"):
+            decoder.get_samples_played_in_range(start_seconds=3, stop_seconds=2)
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    @pytest.mark.parametrize("stop_seconds", (None, "duration", 99999999))
+    def test_get_all_samples(self, asset, stop_seconds):
+        decoder = AudioDecoder(asset.path)
+
+        if stop_seconds == "duration":
+            stop_seconds = asset.duration_seconds
+
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=0, stop_seconds=stop_seconds
+        )
 
         reference_frames = asset.get_frame_data_by_range(
-            start=0,
-            stop=asset.get_frame_index(pts_seconds=asset.duration_seconds) + 1
+            start=0, stop=asset.get_frame_index(pts_seconds=asset.duration_seconds) + 1
         )
 
         torch.testing.assert_close(samples.data, reference_frames)
-        assert samples.pts_seconds == asset.get_frame_info(idx=0).pts_seconds
+        assert samples.sample_rate == asset.sample_rate
+
+        # TODO there's a bug with NASA_AUDIO_MP3: https://github.com/pytorch/torchcodec/issues/553
+        expected_pts = (
+            0.072
+            if asset is NASA_AUDIO_MP3
+            else asset.get_frame_info(idx=0).pts_seconds
+        )
+        assert samples.pts_seconds == expected_pts
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
-    def test_get_samples_played_in_range(self, asset):
+    def test_at_frame_boundaries(self, asset):
         decoder = AudioDecoder(asset.path)
-        
-        start_seconds, stop_seconds = 2, 4
-        samples = decoder.get_samples_played_in_range(start_seconds=start_seconds, stop_seconds=stop_seconds)
+
+        start_frame_index, stop_frame_index = 10, 40
+        start_seconds = asset.get_frame_info(start_frame_index).pts_seconds
+        stop_seconds = asset.get_frame_info(stop_frame_index).pts_seconds
+
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
 
         reference_frames = asset.get_frame_data_by_range(
-            start=asset.get_frame_index(pts_seconds=start_seconds),
-            stop=asset.get_frame_index(pts_seconds=stop_seconds) + 1
+            start=start_frame_index, stop=stop_frame_index
+        )
+
+        assert samples.pts_seconds == start_seconds
+        num_samples = samples.data.shape[1]
+        assert (
+            num_samples
+            == reference_frames.shape[1]
+            == (stop_seconds - start_seconds) * decoder.metadata.sample_rate
+        )
+        torch.testing.assert_close(samples.data, reference_frames)
+        assert samples.sample_rate == asset.sample_rate
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    def test_not_at_frame_boundaries(self, asset):
+        decoder = AudioDecoder(asset.path)
+
+        start_frame_index, stop_frame_index = 10, 40
+        start_frame_info = asset.get_frame_info(start_frame_index)
+        stop_frame_info = asset.get_frame_info(stop_frame_index)
+        start_seconds = start_frame_info.pts_seconds + (
+            start_frame_info.duration_seconds / 2
+        )
+        stop_seconds = stop_frame_info.pts_seconds + (
+            stop_frame_info.duration_seconds / 2
+        )
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+
+        reference_frames = asset.get_frame_data_by_range(
+            start=start_frame_index, stop=stop_frame_index + 1
         )
 
         assert samples.pts_seconds == start_seconds
         num_samples = samples.data.shape[1]
         assert num_samples < reference_frames.shape[1]
-        assert num_samples == (stop_seconds - start_seconds) * decoder.metadata.sample_rate
+        assert (
+            num_samples == (stop_seconds - start_seconds) * decoder.metadata.sample_rate
+        )
+        assert samples.sample_rate == asset.sample_rate
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    def test_start_equals_stop(self, asset):
+        decoder = AudioDecoder(asset.path)
+        samples = decoder.get_samples_played_in_range(start_seconds=3, stop_seconds=3)
+        assert samples.data.shape == (0, 0)
+
+    def test_frame_start_is_not_zero(self):
+        # For NASA_AUDIO_MP3, the first frame is not at 0, it's at 0.072 [1].
+        # So if we request start = 0.05, we shouldn't be truncating anything.
+        #
+        # [1] well, really it's at 0.138125, not 0.072 (see
+        # https://github.com/pytorch/torchcodec/issues/553), but for the purpose
+        # of this test it doesn't matter.
+
+        asset = NASA_AUDIO_MP3
+        start_seconds = 0.05  # this is less than the first frame's pts
+        stop_frame_index = 10
+        stop_seconds = asset.get_frame_info(stop_frame_index).pts_seconds
 
+        decoder = AudioDecoder(asset.path)
+
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+
+        reference_frames = asset.get_frame_data_by_range(start=0, stop=stop_frame_index)
+        torch.testing.assert_close(samples.data, reference_frames)
diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py
@@ -742,7 +742,7 @@ def test_decode_start_equal_stop(self, asset):
         frames, pts_seconds = get_frames_by_pts_in_range_audio(
             decoder, start_seconds=1, stop_seconds=1
         )
-        assert frames.shape == (0,)
+        assert frames.shape == (0, 0)
         assert pts_seconds == 0
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
diff --git a/test/test_frame_dataclasses.py b/test/test_frame_dataclasses.py
@@ -1,6 +1,6 @@
 import pytest
 import torch
-from torchcodec import Frame, FrameBatch, AudioSamples
+from torchcodec import AudioSamples, Frame, FrameBatch
 
 
 def test_unpacking():
@@ -141,6 +141,7 @@ def test_framebatch_indexing():
     assert isinstance(fb_fancy, FrameBatch)
     assert fb_fancy.data.shape == (1, C, H, W)
 
+
 def test_audio_samples_error():
     with pytest.raises(ValueError, match="data must be 2-dimensional"):
         AudioSamples(
@@ -153,4 +154,4 @@ def test_audio_samples_error():
             data=torch.rand(1, 2, 3),
             pts_seconds=1,
             sample_rate=16_000,
-        )
+        )

Original file line number	Diff line number	Diff line change
`@@ -854,7 +854,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(`
`854`	`854`
`855`	`855`	`if (startSeconds == stopSeconds) {`
`856`	`856`	`// For consistency with video`
`857`		`- return AudioFramesOutput{torch::empty({0}), 0.0};`
	`857`	`+ return AudioFramesOutput{torch::empty({0, 0}), 0.0};`
`858`	`858`	`}`
`859`	`859`
`860`	`860`	`StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];`
Original file line number	Diff line number	Diff line change
`@@ -742,7 +742,7 @@ def test_decode_start_equal_stop(self, asset):`
`742`	`742`	`frames, pts_seconds = get_frames_by_pts_in_range_audio(`
`743`	`743`	`decoder, start_seconds=1, stop_seconds=1`
`744`	`744`	`)`
`745`		`- assert frames.shape == (0,)`
	`745`	`+ assert frames.shape == (0, 0)`
`746`	`746`	`assert pts_seconds == 0`
`747`	`747`
`748`	`748`	`@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))`