Merge branch 'main' of github.com:pytorch/torchcodec into migrate_encoding_test

NicolasHug · NicolasHug · commit 96e5e6014514 · 2025-05-22T15:21:54.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -179,6 +179,8 @@ void AudioEncoder::initializeEncoder(
 
   desiredNumChannels_ = static_cast<int>(numChannels.value_or(wf_.sizes()[0]));
   validateNumChannels(*avCodec, desiredNumChannels_);
+  // The avCodecContext layout defines the layout of the encoded output, it's
+  // not related to the input sampes.
   setDefaultChannelLayout(avCodecContext_, desiredNumChannels_);
 
   validateSampleRate(*avCodec, sampleRate);
@@ -233,6 +235,8 @@ void AudioEncoder::encode() {
   avFrame->format = AV_SAMPLE_FMT_FLTP;
   avFrame->sample_rate = avCodecContext_->sample_rate;
   avFrame->pts = 0;
+  // We set the channel layout of the frame to the default layout corresponding
+  // to the input samples' number of channels
   setDefaultChannelLayout(avFrame, static_cast<int>(wf_.sizes()[0]));
 
   auto status = av_frame_get_buffer(avFrame.get(), 0);
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -107,15 +107,17 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {
     // eventually raise.
     return;
   }
-  for (auto i = 0; avCodec.ch_layouts[i].order != AV_CHANNEL_ORDER_UNSPEC;
-       ++i) {
+  // FFmpeg doc indicate that the ch_layouts array is terminated by a zeroed
+  // layout, so checking for nb_channels == 0 should indicate its end.
+  for (auto i = 0; avCodec.ch_layouts[i].nb_channels != 0; ++i) {
     if (numChannels == avCodec.ch_layouts[i].nb_channels) {
       return;
     }
   }
+  // At this point it seems that the encoder doesn't support the requested
+  // number of channels, so we error out.
   std::stringstream supportedNumChannels;
-  for (auto i = 0; avCodec.ch_layouts[i].order != AV_CHANNEL_ORDER_UNSPEC;
-       ++i) {
+  for (auto i = 0; avCodec.ch_layouts[i].nb_channels != 0; ++i) {
     if (i > 0) {
       supportedNumChannels << ", ";
     }
@@ -132,6 +134,8 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {
       return;
     }
   }
+  // At this point it seems that the encoder doesn't support the requested
+  // number of channels, so we error out.
   std::stringstream supportedNumChannels;
   for (auto i = 0; avCodec.channel_layouts[i] != 0; ++i) {
     if (i > 0) {
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -6,6 +6,7 @@
 
 import io
 import os
+import re
 from functools import partial
 
 os.environ["TORCH_LOGS"] = "output_code"
@@ -1119,6 +1120,239 @@ def test_bad_input(self, tmp_path):
                 wf=torch.rand(2, 10), sample_rate=10, filename="./file.bad_extension"
             )
 
+        with pytest.raises(RuntimeError, match="invalid sample rate=10"):
+            encode_audio_to_file(
+                wf=self.decode(NASA_AUDIO_MP3),
+                sample_rate=10,
+                filename=valid_output_file,
+            )
+        with pytest.raises(RuntimeError, match="bit_rate=-1 must be >= 0"):
+            encode_audio_to_file(
+                wf=self.decode(NASA_AUDIO_MP3),
+                sample_rate=NASA_AUDIO_MP3.sample_rate,
+                filename=valid_output_file,
+                bit_rate=-1,  # bad
+            )
+
+        with pytest.raises(RuntimeError, match="Trying to encode 10 channels"):
+            encode_audio_to_file(
+                wf=torch.rand(10, 20), sample_rate=10, filename="doesnt_matter"
+            )
+
+        for num_channels in (0, 3):
+            with pytest.raises(
+                RuntimeError,
+                match=re.escape(
+                    f"Desired number of channels ({num_channels}) is not supported"
+                ),
+            ):
+                encode_audio_to_file(
+                    wf=torch.rand(2, 10),
+                    sample_rate=16_000,
+                    filename="ok.mp3",
+                    num_channels=num_channels,
+                )
+
+    @pytest.mark.parametrize(
+        "encode_method", (encode_audio_to_file, encode_audio_to_tensor)
+    )
+    @pytest.mark.parametrize("output_format", ("wav", "flac"))
+    def test_round_trip(self, encode_method, output_format, tmp_path):
+        # Check that decode(encode(samples)) == samples on lossless formats
+
+        if get_ffmpeg_major_version() == 4 and output_format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        asset = NASA_AUDIO_MP3
+        source_samples = self.decode(asset)
+
+        if encode_method is encode_audio_to_file:
+            encoded_path = tmp_path / f"output.{output_format}"
+            encode_audio_to_file(
+                wf=source_samples,
+                sample_rate=asset.sample_rate,
+                filename=str(encoded_path),
+            )
+            encoded_source = encoded_path
+        else:
+            encoded_source = encode_audio_to_tensor(
+                wf=source_samples, sample_rate=asset.sample_rate, format=output_format
+            )
+            assert encoded_source.dtype == torch.uint8
+            assert encoded_source.ndim == 1
+
+        rtol, atol = (0, 1e-4) if output_format == "wav" else (None, None)
+        torch.testing.assert_close(
+            self.decode(encoded_source), source_samples, rtol=rtol, atol=atol
+        )
+
+    @pytest.mark.skipif(in_fbcode(), reason="TODO: enable ffmpeg CLI")
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
+    @pytest.mark.parametrize("num_channels", (None, 1, 2))
+    @pytest.mark.parametrize("output_format", ("mp3", "wav", "flac"))
+    def test_against_cli(self, asset, bit_rate, num_channels, output_format, tmp_path):
+        # Encodes samples with our encoder and with the FFmpeg CLI, and checks
+        # that both decoded outputs are equal
+
+        if get_ffmpeg_major_version() == 4 and output_format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        encoded_by_ffmpeg = tmp_path / f"ffmpeg_output.{output_format}"
+        subprocess.run(
+            ["ffmpeg", "-i", str(asset.path)]
+            + (["-b:a", f"{bit_rate}"] if bit_rate is not None else [])
+            + (["-ac", f"{num_channels}"] if num_channels is not None else [])
+            + [
+                str(encoded_by_ffmpeg),
+            ],
+            capture_output=True,
+            check=True,
+        )
+
+        encoded_by_us = tmp_path / f"our_output.{output_format}"
+        encode_audio_to_file(
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            filename=str(encoded_by_us),
+            bit_rate=bit_rate,
+            num_channels=num_channels,
+        )
+
+        if output_format == "wav":
+            rtol, atol = 0, 1e-4
+        elif output_format == "mp3" and asset is SINE_MONO_S32 and num_channels == 2:
+            # Not sure why, this one needs slightly higher tol. With default
+            # tolerances, the check fails on ~1% of the samples, so that's
+            # probably fine. It might be that the FFmpeg CLI doesn't rely on
+            # libswresample for converting channels?
+            rtol, atol = 0, 1e-3
+        else:
+            rtol, atol = None, None
+        torch.testing.assert_close(
+            self.decode(encoded_by_ffmpeg),
+            self.decode(encoded_by_us),
+            rtol=rtol,
+            atol=atol,
+        )
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
+    @pytest.mark.parametrize("num_channels", (None, 1, 2))
+    @pytest.mark.parametrize("output_format", ("mp3", "wav", "flac"))
+    def test_tensor_against_file(
+        self, asset, bit_rate, num_channels, output_format, tmp_path
+    ):
+        if get_ffmpeg_major_version() == 4 and output_format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        encoded_file = tmp_path / f"our_output.{output_format}"
+        encode_audio_to_file(
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            filename=str(encoded_file),
+            bit_rate=bit_rate,
+            num_channels=num_channels,
+        )
+
+        encoded_tensor = encode_audio_to_tensor(
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            format=output_format,
+            bit_rate=bit_rate,
+            num_channels=num_channels,
+        )
+
+        torch.testing.assert_close(
+            self.decode(encoded_file), self.decode(encoded_tensor)
+        )
+
+    def test_encode_to_tensor_long_output(self):
+        # Check that we support re-allocating the output tensor when the encoded
+        # data is large.
+        samples = torch.rand(1, int(1e7))
+        encoded_tensor = encode_audio_to_tensor(
+            wf=samples,
+            sample_rate=16_000,
+            format="flac",
+            bit_rate=44_000,
+        )
+        # Note: this should be in sync with its C++ counterpart for the test to
+        # be meaningful.
+        INITIAL_TENSOR_SIZE = 10_000_000
+        assert encoded_tensor.numel() > INITIAL_TENSOR_SIZE
+
+        torch.testing.assert_close(self.decode(encoded_tensor), samples)
+
+    def test_contiguity(self):
+        # Ensure that 2 waveforms with the same values are encoded in the same
+        # way, regardless of their memory layout. Here we encode 2 equal
+        # waveforms, one is row-aligned while the other is column-aligned.
+
+        num_samples = 10_000  # per channel
+        contiguous_samples = torch.rand(2, num_samples).contiguous()
+        assert contiguous_samples.stride() == (num_samples, 1)
+
+        encoded_from_contiguous = encode_audio_to_tensor(
+            wf=contiguous_samples,
+            sample_rate=16_000,
+            format="flac",
+            bit_rate=44_000,
+        )
+        non_contiguous_samples = contiguous_samples.T.contiguous().T
+        assert non_contiguous_samples.stride() == (1, 2)
+
+        torch.testing.assert_close(
+            contiguous_samples, non_contiguous_samples, rtol=0, atol=0
+        )
+
+        encoded_from_non_contiguous = encode_audio_to_tensor(
+            wf=non_contiguous_samples,
+            sample_rate=16_000,
+            format="flac",
+            bit_rate=44_000,
+        )
+
+        torch.testing.assert_close(
+            encoded_from_contiguous, encoded_from_non_contiguous, rtol=0, atol=0
+        )
+
+    @pytest.mark.parametrize("num_channels_input", (1, 2))
+    @pytest.mark.parametrize("num_channels_output", (1, 2, None))
+    @pytest.mark.parametrize(
+        "encode_method", (encode_audio_to_file, encode_audio_to_tensor)
+    )
+    def test_num_channels(
+        self, num_channels_input, num_channels_output, encode_method, tmp_path
+    ):
+        # We just check that the num_channels parameter is respected.
+        # Correctness is checked in other tests (like test_against_cli())
+
+        sample_rate = 16_000
+        source_samples = torch.rand(num_channels_input, 1_000)
+        format = "mp3"
+
+        if encode_method is encode_audio_to_file:
+            encoded_path = tmp_path / f"output.{format}"
+            encode_audio_to_file(
+                wf=source_samples,
+                sample_rate=sample_rate,
+                filename=str(encoded_path),
+                num_channels=num_channels_output,
+            )
+            encoded_source = encoded_path
+        else:
+            encoded_source = encode_audio_to_tensor(
+                wf=source_samples,
+                sample_rate=sample_rate,
+                format=format,
+                num_channels=num_channels_output,
+            )
+
+        if num_channels_output is None:
+            num_channels_output = num_channels_input
+        assert self.decode(encoded_source).shape[0] == num_channels_output
+
 
 if __name__ == "__main__":
     pytest.main()

Original file line number	Diff line number	Diff line change
`@@ -107,15 +107,17 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {`
`107`	`107`	`// eventually raise.`
`108`	`108`	`return;`
`109`	`109`	`}`
`110`		`- for (auto i = 0; avCodec.ch_layouts[i].order != AV_CHANNEL_ORDER_UNSPEC;`
`111`		`- ++i) {`
	`110`	`+ // FFmpeg doc indicate that the ch_layouts array is terminated by a zeroed`
	`111`	`+ // layout, so checking for nb_channels == 0 should indicate its end.`
	`112`	`+ for (auto i = 0; avCodec.ch_layouts[i].nb_channels != 0; ++i) {`
`112`	`113`	`if (numChannels == avCodec.ch_layouts[i].nb_channels) {`
`113`	`114`	`return;`
`114`	`115`	`}`
`115`	`116`	`}`
	`117`	`+ // At this point it seems that the encoder doesn't support the requested`
	`118`	`+ // number of channels, so we error out.`
`116`	`119`	`std::stringstream supportedNumChannels;`
`117`		`- for (auto i = 0; avCodec.ch_layouts[i].order != AV_CHANNEL_ORDER_UNSPEC;`
`118`		`- ++i) {`
	`120`	`+ for (auto i = 0; avCodec.ch_layouts[i].nb_channels != 0; ++i) {`
`119`	`121`	`if (i > 0) {`
`120`	`122`	`supportedNumChannels << ", ";`
`121`	`123`	`}`
`@@ -132,6 +134,8 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {`
`132`	`134`	`return;`
`133`	`135`	`}`
`134`	`136`	`}`
	`137`	`+ // At this point it seems that the encoder doesn't support the requested`
	`138`	`+ // number of channels, so we error out.`
`135`	`139`	`std::stringstream supportedNumChannels;`
`136`	`140`	`for (auto i = 0; avCodec.channel_layouts[i] != 0; ++i) {`
`137`	`141`	`if (i > 0) {`