Merge branch 'main' of github.com:pytorch/torchcodec into encode-twice

NicolasHug · NicolasHug · commit 6e294a91bdcc · 2025-04-30T11:02:42.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -13,10 +13,8 @@ torch::Tensor validateWf(torch::Tensor wf) {
       wf.dtype() == torch::kFloat32,
       "waveform must have float32 dtype, got ",
       wf.dtype());
-  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
-  // planar (fltp).
   TORCH_CHECK(wf.dim() == 2, "waveform must have 2 dimensions, got ", wf.dim());
-  return wf;
+  return wf.contiguous();
 }
 
 void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -878,8 +878,9 @@ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
   while (!finished) {
     try {
       UniqueAVFrame avFrame =
-          decodeAVFrame([startPts](const UniqueAVFrame& avFrame) {
-            return startPts < avFrame->pts + getDuration(avFrame);
+          decodeAVFrame([startPts, stopPts](const UniqueAVFrame& avFrame) {
+            return startPts < avFrame->pts + getDuration(avFrame) &&
+                stopPts > avFrame->pts;
           });
       auto frameOutput = convertAVFrameToFrameOutput(avFrame);
       if (!firstFramePtsSeconds.has_value()) {
@@ -908,9 +909,12 @@ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
   TORCH_CHECK(
       frames.size() > 0 && firstFramePtsSeconds.has_value(),
       "No audio frames were decoded. ",
-      "This is probably because start_seconds is too high? ",
-      "Current value is ",
-      startSeconds);
+      "This is probably because start_seconds is too high(",
+      startSeconds,
+      "),",
+      "or because stop_seconds(",
+      stopSecondsOptional,
+      ") is too low.");
 
   return AudioFramesOutput{torch::cat(frames, 1), *firstFramePtsSeconds};
 }
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -1112,7 +1112,8 @@ def test_start_equals_stop(self, asset):
 
     def test_frame_start_is_not_zero(self):
         # For NASA_AUDIO_MP3, the first frame is not at 0, it's at 0.138125.
-        # So if we request start = 0.05, we shouldn't be truncating anything.
+        # So if we request (start, stop) = (0.05, None), we shouldn't be
+        # truncating anything.
 
         asset = NASA_AUDIO_MP3
         start_seconds = 0.05  # this is less than the first frame's pts
@@ -1128,6 +1129,35 @@ def test_frame_start_is_not_zero(self):
         reference_frames = asset.get_frame_data_by_range(start=0, stop=stop_frame_index)
         torch.testing.assert_close(samples.data, reference_frames)
 
+        # Non-regression test for https://github.com/pytorch/torchcodec/issues/567
+        # If we ask for start < stop <= first_frame_pts, we should raise.
+        with pytest.raises(RuntimeError, match="No audio frames were decoded"):
+            decoder.get_samples_played_in_range(start_seconds=0, stop_seconds=0.05)
+
+        first_frame_pts_seconds = asset.get_frame_info(idx=0).pts_seconds
+        with pytest.raises(RuntimeError, match="No audio frames were decoded"):
+            decoder.get_samples_played_in_range(
+                start_seconds=0, stop_seconds=first_frame_pts_seconds
+            )
+
+        # Documenting an edge case: we ask for samples barely beyond the start
+        # of the first frame. The C++ decoder returns the first frame, which
+        # gets (correctly!) truncated by the AudioDecoder, and we end up with
+        # empty data.
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=0, stop_seconds=first_frame_pts_seconds + 1e-5
+        )
+        assert samples.data.shape == (2, 0)
+        assert samples.pts_seconds == first_frame_pts_seconds
+        assert samples.duration_seconds == 0
+
+        # if we ask for a little bit more samples, we get non-empty data
+        samples = decoder.get_samples_played_in_range(
+            start_seconds=0, stop_seconds=first_frame_pts_seconds + 1e-3
+        )
+        assert samples.data.shape == (2, 8)
+        assert samples.pts_seconds == first_frame_pts_seconds
+
     def test_single_channel(self):
         asset = SINE_MONO_S32
         decoder = AudioDecoder(asset.path)
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -884,23 +884,6 @@ def test_pts(self, asset):
 
             assert pts_seconds == start_seconds
 
-    def test_decode_before_frame_start(self):
-        # Test illustrating bug described in
-        # https://github.com/pytorch/torchcodec/issues/567
-        asset = NASA_AUDIO_MP3
-
-        decoder = create_from_file(str(asset.path), seek_mode="approximate")
-        add_audio_stream(decoder)
-
-        frames, *_ = get_frames_by_pts_in_range_audio(
-            decoder, start_seconds=0, stop_seconds=0.05
-        )
-        all_frames, *_ = get_frames_by_pts_in_range_audio(
-            decoder, start_seconds=0, stop_seconds=None
-        )
-        # TODO fix this. `frames` should be empty.
-        torch.testing.assert_close(frames, all_frames)
-
     def test_sample_rate_conversion(self):
         def get_all_frames(asset, sample_rate=None, stop_seconds=None):
             decoder = create_from_file(str(asset.path), seek_mode="approximate")
@@ -1284,6 +1267,39 @@ def test_encode_to_tensor_long_output(self):
 
         torch.testing.assert_close(self.decode(encoded_tensor), samples)
 
+    def test_contiguity(self):
+        # Ensure that 2 waveforms with the same values are encoded in the same
+        # way, regardless of their memory layout. Here we encode 2 equal
+        # waveforms, one is row-aligned while the other is column-aligned.
+
+        num_samples = 10_000  # per channel
+        contiguous_samples = torch.rand(2, num_samples).contiguous()
+        assert contiguous_samples.stride() == (num_samples, 1)
+
+        encoded_from_contiguous = encode_audio_to_tensor(
+            wf=contiguous_samples,
+            sample_rate=16_000,
+            format="flac",
+            bit_rate=44_000,
+        )
+        non_contiguous_samples = contiguous_samples.T.contiguous().T
+        assert non_contiguous_samples.stride() == (1, 2)
+
+        torch.testing.assert_close(
+            contiguous_samples, non_contiguous_samples, rtol=0, atol=0
+        )
+
+        encoded_from_non_contiguous = encode_audio_to_tensor(
+            wf=non_contiguous_samples,
+            sample_rate=16_000,
+            format="flac",
+            bit_rate=44_000,
+        )
+
+        torch.testing.assert_close(
+            encoded_from_contiguous, encoded_from_non_contiguous, rtol=0, atol=0
+        )
+
 
 if __name__ == "__main__":
     pytest.main()