meta-pytorch
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 7 additions & 9 deletions b/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎test/decoders/test_ops.py‎
Lines changed: 40 additions & 0 deletions b/‎test/decoders/test_ops.py‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎test/resources/nasa_13013.mp4.audio.mp3.stream0.all_frames.pt‎
864 KB b/‎test/resources/nasa_13013.mp4.audio.mp3.stream0.all_frames.pt‎
864 KB
diff --git a/‎test/resources/nasa_13013.mp4.stream4.all_frames.pt‎
51.9 KB b/‎test/resources/nasa_13013.mp4.stream4.all_frames.pt‎
51.9 KB
diff --git a/‎test/utils.py‎
Lines changed: 22 additions & 28 deletions b/‎test/utils.py‎
Lines changed: 22 additions & 28 deletions
@@ -575,7 +575,7 @@ void VideoDecoder::addAudioStream(int streamIndex) {
   // TODO-AUDIO
   TORCH_CHECK(
       streamMetadata.averageFps.has_value(),
-      "frame_size or sampl_rate aren't known. Cannot decode.");
+      "frame_size or sample_rate aren't known. Cannot decode.");
 
   streamMetadata.sampleRate =
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
@@ -1311,20 +1311,18 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
   auto numSamples = avFrame->nb_samples; // per channel
   auto numChannels = getNumChannels(avFrame);
 
-  // TODO-AUDIO: dtype should be format-dependent
-  // TODO-AUDIO rename data to something else
-  torch::Tensor data;
+  torch::Tensor outputData;
   if (preAllocatedOutputTensor.has_value()) {
-    data = preAllocatedOutputTensor.value();
+    outputData = preAllocatedOutputTensor.value();
   } else {
-    data = torch::empty({numChannels, numSamples}, torch::kFloat32);
+    outputData = torch::empty({numChannels, numSamples}, torch::kFloat32);
   }
 
   AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
-  // TODO Implement all formats
+  // TODO-AUDIO Implement all formats.
   switch (format) {
     case AV_SAMPLE_FMT_FLTP: {
-      uint8_t* outputChannelData = static_cast<uint8_t*>(data.data_ptr());
+      uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
       auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
       for (auto channel = 0; channel < numChannels;
            ++channel, outputChannelData += numBytesPerChannel) {
@@ -1341,7 +1339,7 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
           "Unsupported audio format (yet!): ",
           av_get_sample_fmt_name(format));
   }
-  frameOutput.data = data;
+  frameOutput.data = outputData;
 }
 
 // --------------------------------------------------------------------------
 
@@ -40,6 +40,7 @@
     assert_frames_equal,
     cpu_and_cuda,
     NASA_AUDIO,
+    NASA_AUDIO_MP3,
     NASA_VIDEO,
     needs_cuda,
 )
@@ -637,6 +638,45 @@ def test_audio_bad_seek_mode(self):
         ):
             add_audio_stream(decoder)
 
+    def test_audio_decode_all_samples_with_get_frames_by_pts_in_range(self):
+        """Decoding the whole pts range must match the stacked reference frames."""
+        asset = NASA_AUDIO
+        decoder = create_from_file(str(asset.path), seek_mode="approximate")
+        add_audio_stream(decoder)
+
+        # Stacked shape is (num_frames, C, num_samples_per_frame).
+        expected = torch.stack(
+            [asset.get_frame_data_by_index(idx) for idx in range(asset.num_frames)]
+        )
+
+        decoded, *_ = get_frames_by_pts_in_range(
+            decoder, start_seconds=0, stop_seconds=asset.duration_seconds
+        )
+        assert_frames_equal(decoded, expected)
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    def test_audio_decode_all_samples_with_next(self, asset):
+        """Frames decoded one by one must concatenate to the reference samples."""
+        decoder = create_from_file(str(asset.path), seek_mode="approximate")
+        add_audio_stream(decoder)
+
+        # Reference frames joined along the last (sample) dimension, in frame
+        # order: shape is (C, num_frames * num_samples_per_frame).
+        reference_frames = torch.cat(
+            [asset.get_frame_data_by_index(i) for i in range(asset.num_frames)],
+            dim=-1,
+        )
+
+        all_frames = []
+        while True:
+            try:
+                frame, *_ = get_next_frame(decoder)
+            except IndexError:  # treated as the end-of-stream signal
+                break
+            all_frames.append(frame)
+        all_frames = torch.cat(all_frames, dim=-1)
+        assert_frames_equal(all_frames, reference_frames)
+
     @pytest.mark.parametrize(
         "start_seconds, stop_seconds",
         (
 
@@ -3,8 +3,8 @@
 import pathlib
 import sys
 
-from dataclasses import dataclass
-from typing import Dict, Optional
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
 
 import numpy as np
 import pytest
@@ -203,6 +203,8 @@ class TestVideoStreamInfo:
 
 @dataclass
 class TestVideo(TestContainerFile):
+    """Base class for the *video* streams of a video container"""
+
     stream_infos: Dict[int, TestVideoStreamInfo]
 
     def get_frame_data_by_index(
@@ -318,13 +320,16 @@ class TestAudioStreamInfo:
     sample_rate: int
     num_channels: int
     duration_seconds: float
+    num_frames: int
 
 
 @dataclass
 class TestAudio(TestContainerFile):
+    """Base class for the *audio* streams of a container (potentially a video),
+    or a pure audio file"""
 
     stream_infos: Dict[int, TestAudioStreamInfo]
-    _reference_frames: tuple[torch.Tensor] = tuple()
+    _reference_frames: Dict[int, List[torch.Tensor]] = field(default_factory=dict)
 
     # Storing each individual frame is too expensive for audio, because there's
     # a massive overhead in the binary format saved by pytorch. Saving all the
@@ -333,32 +338,22 @@ class TestAudio(TestContainerFile):
     # So we store the reference frames in a single file, and load/cache those
     # when the TestAudio instance is created.
     def __post_init__(self):
-        # We hard-code the default stream index, see TODO below.
-        file_path = _get_file_path(
-            f"{self.filename}.stream{self.default_stream_index}.all_frames.pt"
-        )
-        if not file_path.exists():
-            return  # TODO-audio
-        t = torch.load(file_path, weights_only=True)
+        for stream_index in self.stream_infos:
+            file_path = _get_file_path(
+                f"{self.filename}.stream{stream_index}.all_frames.pt"
+            )
 
-        # These are hard-coded value assuming stream 4 of nasa_13013.mp4. Each
-        # of the 204 frames contains 1024 samples.
-        # TODO make this more generic
-        assert t.shape == (2, 204 * 1024)
-        self._reference_frames = torch.chunk(t, chunks=204, dim=1)
+            self._reference_frames[stream_index] = torch.load(
+                file_path, weights_only=True
+            )
 
     def get_frame_data_by_index(
         self, idx: int, *, stream_index: Optional[int] = None
     ) -> torch.Tensor:
-        if stream_index is not None and stream_index != self.default_stream_index:
-            # TODO address this, the fix should be to let _reference_frames be a
-            # dict[tuple[torch.Tensor]] where keys are stream indices, and load
-            # all of those indices in __post_init__.
-            raise ValueError(
-                "Can only use default stream index with TestAudio for now."
-            )
+        if stream_index is None:
+            stream_index = self.default_stream_index
 
-        return self._reference_frames[idx]
+        return self._reference_frames[stream_index][idx]
 
     def pts_to_frame_index(self, pts_seconds: float) -> int:
         # These are hard-coded value assuming stream 4 of nasa_13013.mp4. Each
@@ -379,10 +374,9 @@ def num_channels(self) -> int:
     def duration_seconds(self) -> float:
         return self.stream_infos[self.default_stream_index].duration_seconds
 
-    # TODO: this shouldn't be named chw. Also values are hard-coded
     @property
-    def empty_chw_tensor(self) -> torch.Tensor:
-        return torch.empty([0, 2, 1024], dtype=torch.float32)
+    def num_frames(self) -> int:
+        return self.stream_infos[self.default_stream_index].num_frames
 
 
 NASA_AUDIO_MP3 = TestAudio(
@@ -391,7 +385,7 @@ def empty_chw_tensor(self) -> torch.Tensor:
     frames={},  # TODO
     stream_infos={
         0: TestAudioStreamInfo(
-            sample_rate=8_000, num_channels=2, duration_seconds=13.248
+            sample_rate=8_000, num_channels=2, duration_seconds=13.248, num_frames=183
         )
     },
 )
@@ -402,7 +396,7 @@ def empty_chw_tensor(self) -> torch.Tensor:
     frames={},  # TODO
     stream_infos={
         4: TestAudioStreamInfo(
-            sample_rate=16_000, num_channels=2, duration_seconds=13.056
+            sample_rate=16_000, num_channels=2, duration_seconds=13.056, num_frames=204
         )
     },
 )