Skip to content

Commit 0f9e14d

Browse files
committed
WIP
1 parent 277fac2 commit 0f9e14d

File tree

6 files changed

+97
-16
lines changed

6 files changed

+97
-16
lines changed

src/torchcodec/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# Note: usort wants to put Frame and FrameBatch after decoders and samplers,
88
# but that results in circular import.
9-
from ._frame import Frame, FrameBatch # usort:skip # noqa
9+
from ._frame import AudioSamples, Frame, FrameBatch # usort:skip # noqa
1010
from . import decoders, samplers # noqa
1111

1212
try:

src/torchcodec/_frame.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313

1414
def _frame_repr(self):
15-
# Utility to replace Frame and FrameBatch __repr__ method. This prints the
15+
# Utility to replace __repr__ method of dataclasses below. This prints the
1616
# shape of the .data tensor rather than printing the (potentially very long)
1717
# data tensor itself.
1818
s = self.__class__.__name__ + ":\n"
@@ -114,3 +114,25 @@ def __len__(self):
114114

115115
def __repr__(self):
116116
return _frame_repr(self)
117+
118+
@dataclass
class AudioSamples(Iterable):
    """Audio samples with associated metadata."""

    # TODO-AUDIO: docs
    # Decoded samples, 2D tensor of shape (num_channels, num_samples)
    # (the decoder slices dim 1 by sample count).
    data: Tensor
    # Presentation timestamp of the first sample, in seconds.
    pts_seconds: float
    # Number of samples per second.
    sample_rate: int

    def __post_init__(self):
        # Called automatically after the dataclass-generated __init__() when an
        # AudioSamples instance is created. We run input validation checks and
        # normalize field types here.
        if self.data.ndim != 2:
            raise ValueError(f"data must be 2-dimensional, got {self.data.shape = }")
        # Coerce so that e.g. int pts or float sample_rate inputs still yield
        # the declared field types.
        self.pts_seconds = float(self.pts_seconds)
        self.sample_rate = int(self.sample_rate)

    def __iter__(self) -> Iterator[Union[Tensor, float]]:
        # Yield fields in declaration order so instances support
        # tuple-unpacking: data, pts_seconds, sample_rate = samples
        for field in dataclasses.fields(self):
            yield getattr(self, field.name)

    def __repr__(self):
        # Shared repr helper: prints data.shape instead of the full tensor.
        return _frame_repr(self)

src/torchcodec/decoders/_audio_decoder.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from torch import Tensor
1111

12+
from torchcodec import AudioSamples
1213
from torchcodec.decoders import _core as core
1314
from torchcodec.decoders._decoder_utils import (
1415
create_decoder,
@@ -39,7 +40,7 @@ def __init__(
3940
)
4041

4142
def get_samples_played_in_range(
42-
self, start_seconds: float = 0, stop_seconds: Optional[float] = None
43+
self, start_seconds: float, stop_seconds: Optional[float] = None
4344
) -> Tensor:
4445
"""TODO-AUDIO docs"""
4546
if stop_seconds is not None and not start_seconds <= stop_seconds:
@@ -63,26 +64,37 @@ def get_samples_played_in_range(
6364
#
6465
# first_pts last_pts
6566
# v v
66-
# ....x..........x..........x...........x..........x..........x..........x.....
67+
# ....x..........x..........x...........x..........x..........x.....
6768
# ^ ^
6869
# start_seconds stop_seconds
6970
#
7071
# We want to return the samples in [start_seconds, stop_seconds). But
7172
# because the core API is based on frames, the `frames` tensor contains
7273
# the samples in [first_pts, last_pts)
73-
#
7474
# So we do some basic math to figure out the position of the view that
75-
# we'l; return.
75+
# we'll return.
7676

77-
offset_beginning = round(
78-
(max(0, start_seconds - first_pts)) * self.metadata.sample_rate
79-
)
77+
# TODO: sample_rate is either the original one from metadata, or the
78+
# user-specified one (NIY)
79+
sample_rate = self.metadata.sample_rate
80+
81+
if first_pts < start_seconds:
82+
offset_beginning = round((start_seconds - first_pts) * sample_rate)
83+
output_pts_seconds = start_seconds
84+
else:
85+
offset_beginning = 0
86+
output_pts_seconds = first_pts
8087

8188
num_samples = frames.shape[1]
82-
offset_end = num_samples
8389
last_pts = first_pts + num_samples / self.metadata.sample_rate
8490
if stop_seconds is not None and stop_seconds < last_pts:
85-
offset_end -= round((last_pts - stop_seconds) * self.metadata.sample_rate)
91+
offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
92+
else:
93+
offset_end = num_samples
94+
95+
return AudioSamples(
96+
data=frames[:, offset_beginning:offset_end],
97+
pts_seconds=output_pts_seconds,
98+
sample_rate=sample_rate,
99+
)
86100

87-
return frames[:, offset_beginning:offset_end]
88-
# return frames[:, offset_beginning:offset_end]

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ class VideoDecoder {
147147
// DECODING AND SEEKING APIs
148148
// --------------------------------------------------------------------------
149149

150-
// All public decoding entry points return either a FrameOutput or a
150+
// All public video decoding entry points return either a FrameOutput or a
151151
// FrameBatchOutput.
152152
// They are the equivalent of the user-facing Frame and FrameBatch classes in
153153
// Python. They contain RGB decoded frames along with some associated data

test/decoders/test_decoders.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -955,3 +955,35 @@ def test_metadata(self, asset):
955955
)
956956
assert decoder.metadata.sample_rate == asset.sample_rate
957957
assert decoder.metadata.num_channels == asset.num_channels
958+
959+
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
def test_get_all_samples(self, asset):
    # Decoding from 0 with stop_seconds=None must return every sample of the
    # stream, starting at the pts of the very first frame.
    decoder = AudioDecoder(asset.path)
    samples = decoder.get_samples_played_in_range(start_seconds=0, stop_seconds=None)

    # Reference: all frames from the first one up to and including the frame
    # that covers the end of the stream.
    last_frame_index = asset.get_frame_index(pts_seconds=asset.duration_seconds)
    all_frames = asset.get_frame_data_by_range(start=0, stop=last_frame_index + 1)

    torch.testing.assert_close(samples.data, all_frames)
    assert samples.pts_seconds == asset.get_frame_info(idx=0).pts_seconds
972+
973+
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
def test_get_samples_played_in_range(self, asset):
    # The returned view starts exactly at start_seconds and spans exactly
    # (stop_seconds - start_seconds) seconds -- a strict subset of the frames
    # that overlap the requested range.
    start_seconds, stop_seconds = 2, 4
    decoder = AudioDecoder(asset.path)
    samples = decoder.get_samples_played_in_range(
        start_seconds=start_seconds, stop_seconds=stop_seconds
    )

    overlapping_frames = asset.get_frame_data_by_range(
        start=asset.get_frame_index(pts_seconds=start_seconds),
        stop=asset.get_frame_index(pts_seconds=stop_seconds) + 1,
    )

    assert samples.pts_seconds == start_seconds
    num_samples = samples.data.shape[1]
    assert num_samples < overlapping_frames.shape[1]
    assert num_samples == (stop_seconds - start_seconds) * decoder.metadata.sample_rate
989+

test/test_frame_dataclasses.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import pytest
22
import torch
3-
from torchcodec import Frame, FrameBatch
3+
from torchcodec import Frame, FrameBatch, AudioSamples
44

55

6-
def test_unpacking():
    # Frame and AudioSamples are Iterables: they support tuple-unpacking of
    # their fields in declaration order.
    frame = Frame(torch.rand(3, 4, 5), 2, 3)
    data, pts_seconds, duration_seconds = frame  # noqa
    audio_samples = AudioSamples(torch.rand(2, 4), 2, 16_000)
    data, pts_seconds, sample_rate = audio_samples  # noqa
89

910

1011
def test_frame_error():
@@ -139,3 +140,17 @@ def test_framebatch_indexing():
139140
fb_fancy = fb[[[0], [1]]] # select T=0 and N=1.
140141
assert isinstance(fb_fancy, FrameBatch)
141142
assert fb_fancy.data.shape == (1, C, H, W)
143+
144+
def test_audio_samples_error():
    # AudioSamples only accepts 2D data: both 1D and 3D tensors are rejected.
    for bad_shape in ((1,), (1, 2, 3)):
        with pytest.raises(ValueError, match="data must be 2-dimensional"):
            AudioSamples(
                data=torch.rand(bad_shape),
                pts_seconds=1,
                sample_rate=16_000,
            )

0 commit comments

Comments (0)