Use soundfile for mp3 decoding instead of torchaudio (#5573)

Polina Kazakova · mariosasko · web-flow · commit f96547708a88 · 2023-02-28T21:16:02.000+01:00
* use soundfile for mp3 decoding instead of torchaudio

* fix some tests

* remove torch and torchaudio from library's requirements

* refactor audio decoding, decode everything with soundfile

* remove torchaudio latest test ci stage, remove libsndfile and sox binaries installation

* remove checks for libsndfile in tests since it's bundled in python library

* remove instructions about installing via package manager since it's misleading

* pin soundfile version to the latest

* update documentation

* fix setup

* Update docs/source/installation.md

Co-authored-by: Mario Šaško <mariosasko777@gmail.com>

* refactor decoding: move all the code under the main decode_example func

* get audio format with os.path instead of string split

* add module config variables for opus and mp3 support

* apply steven's suggestion to installation docs

* wrap torch.from_numpy in a func to avoid torch.from_numpy pickling error

* Apply suggestions from code review

Co-authored-by: Mario Šaško <mariosasko777@gmail.com>

* fix code style

* import xsplitext

---------

Co-authored-by: Mario Šaško <mariosasko777@gmail.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -40,11 +40,6 @@ jobs:
     continue-on-error: ${{ matrix.test == 'integration' }}
     runs-on: ${{ matrix.os }}
     steps:
-      - name: Install OS dependencies
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          sudo apt-get -y update
-          sudo apt-get -y install libsndfile1 sox
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
@@ -72,16 +67,6 @@ jobs:
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
-      - name: Install dependencies to test torchaudio>=0.12 on Ubuntu
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          pip uninstall -y torchaudio torch
-          pip install "torchaudio>=0.12"
-          sudo apt-get -y install ffmpeg
-      - name: Test torchaudio>=0.12 on Ubuntu
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          python -m pytest -rfExX -m torchaudio_latest -n 2 --dist loadfile -sv ./tests/features/test_audio.py
 
   test_py310:
     needs: check_code_quality
@@ -93,11 +78,6 @@ jobs:
     continue-on-error: false
     runs-on: ${{ matrix.os }}
     steps:
-      - name: Install OS dependencies
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          sudo apt-get -y update
-          sudo apt-get -y install libsndfile1 sox
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
@@ -112,13 +92,3 @@ jobs:
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
-      - name: Install dependencies to test torchaudio>=0.12 on Ubuntu
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          pip uninstall -y torchaudio torch
-          pip install "torchaudio>=0.12"
-          sudo apt-get -y install ffmpeg
-      - name: Test torchaudio>=0.12 on Ubuntu
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          python -m pytest -rfExX -m torchaudio_latest -n 2 --dist loadfile -sv ./tests/features/test_audio.py
diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx
@@ -1,7 +1,7 @@
 # Load audio data
 
 You can load an audio dataset using the [`Audio`] feature that automatically decodes and resamples the audio files when you access the examples.
-Audio decoding is based on `librosa` in general, and `torchaudio` for MP3.
+Audio decoding is based on the [`soundfile`](https://github.com/bastibe/python-soundfile) python package, which uses the [`libsndfile`](https://github.com/libsndfile/libsndfile) C library under the hood.
 
 ## Installation
 
diff --git a/docs/source/installation.md b/docs/source/installation.md
@@ -67,31 +67,15 @@ pip install datasets[audio]
 
 <Tip warning={true}>
 
-On Linux, non-Python dependency on `libsndfile` package must be installed manually, using your distribution package manager, for example:
+To decode mp3 files, you need to have at least version 1.1.0 of the `libsndfile` system library. Usually, it's bundled with the python [`soundfile`](https://github.com/bastibe/python-soundfile) package, which is installed as an extra audio dependency for 🤗 Datasets.
+For Linux, the required version of `libsndfile` is bundled with `soundfile` starting from version 0.12.0. You can run the following command to determine which version of `libsndfile` is being used by `soundfile`:
 
 ```bash
-sudo apt-get install libsndfile1
+python -c "import soundfile; print(soundfile.__libsndfile_version__)"
 ```
 
 </Tip>
 
-To support loading audio datasets containing MP3 files, users should also install [torchaudio](https://pytorch.org/audio/stable/index.html) to handle the audio data with high performance:
-
-```bash
-pip install 'torchaudio<0.12.0'
-```
-
-<Tip warning={true}>
-
-torchaudio's `sox_io` [backend](https://pytorch.org/audio/stable/backend.html#) supports decoding MP3 files. Unfortunately, the `sox_io` backend is only available on Linux/macOS and isn't supported by Windows.
-
-You need to install it using your distribution package manager, for example:
-
-```bash
-sudo apt-get install sox
-```
-
-</Tip>
 
 ## Vision
 
diff --git a/setup.py b/setup.py
@@ -142,6 +142,7 @@
 ]
 
 AUDIO_REQUIRE = [
+    "soundfile>=0.12.1",
     "librosa",
 ]
 
@@ -176,8 +177,7 @@
     "tensorflow-macos; sys_platform == 'darwin' and platform_machine == 'arm64'",
     "tiktoken;python_version>='3.8'",
     "torch",
-    "torchaudio<0.12.0",
-    "soundfile",
+    "soundfile>=0.12.1",
     "transformers",
     "zstandard",
 ]
diff --git a/src/datasets/config.py b/src/datasets/config.py
@@ -130,7 +130,12 @@
 
 # Optional tools for feature decoding
 PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
-
+IS_OPUS_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
+    importlib.import_module("soundfile").__libsndfile_version__
+) >= version.parse("1.0.31")
+IS_MP3_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
+    importlib.import_module("soundfile").__libsndfile_version__
+) >= version.parse("1.1.0")
 
 # Optional compression tools
 RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
@@ -1,15 +1,13 @@
 import os
-import warnings
 from dataclasses import dataclass, field
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union
 
 import numpy as np
 import pyarrow as pa
-from packaging import version
 
 from .. import config
-from ..download.streaming_download_manager import xopen
+from ..download.streaming_download_manager import xopen, xsplitext
 from ..table import array_cast
 from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
@@ -150,20 +148,47 @@ def decode_example(
         path, file = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None)
         if path is None and file is None:
             raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.")
-        elif path is not None and path.endswith("mp3"):
-            array, sampling_rate = self._decode_mp3(file if file else path)
-        elif path is not None and path.endswith("opus"):
-            if file:
-                array, sampling_rate = self._decode_non_mp3_file_like(file, "opus")
-            else:
-                array, sampling_rate = self._decode_non_mp3_path_like(
-                    path, "opus", token_per_repo_id=token_per_repo_id
-                )
+
+        try:
+            import librosa
+            import soundfile as sf
+        except ImportError as err:
+            raise ImportError("To support decoding audio files, please install 'librosa' and 'soundfile'.") from err
+
+        audio_format = xsplitext(path)[1][1:].lower() if path is not None else None
+        if not config.IS_OPUS_SUPPORTED and audio_format == "opus":
+            raise RuntimeError(
+                "Decoding 'opus' files requires system library 'libsndfile'>=1.0.31, "
+                'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. '
+            )
+        elif not config.IS_MP3_SUPPORTED and audio_format == "mp3":
+            raise RuntimeError(
+                "Decoding 'mp3' files requires system library 'libsndfile'>=1.1.0, "
+                'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. '
+            )
+
+        if file is None:
+            token_per_repo_id = token_per_repo_id or {}
+            source_url = path.split("::")[-1]
+            try:
+                repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
+                use_auth_token = token_per_repo_id[repo_id]
+            except (ValueError, KeyError):
+                use_auth_token = None
+
+            with xopen(path, "rb", use_auth_token=use_auth_token) as f:
+                array, sampling_rate = sf.read(f)
+
         else:
-            if file:
-                array, sampling_rate = self._decode_non_mp3_file_like(file)
-            else:
-                array, sampling_rate = self._decode_non_mp3_path_like(path, token_per_repo_id=token_per_repo_id)
+            array, sampling_rate = sf.read(file)
+
+        array = array.T
+        if self.mono:
+            array = librosa.to_mono(array)
+        if self.sampling_rate and self.sampling_rate != sampling_rate:
+            array = librosa.resample(array, orig_sr=sampling_rate, target_sr=self.sampling_rate)
+            sampling_rate = self.sampling_rate
+
         return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
     def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
@@ -242,129 +267,3 @@ def path_to_bytes(path):
         )
         storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
         return array_cast(storage, self.pa_type)
-
-    def _decode_non_mp3_path_like(
-        self, path, format=None, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None
-    ):
-        try:
-            import librosa
-        except ImportError as err:
-            raise ImportError("To support decoding audio files, please install 'librosa'.") from err
-
-        token_per_repo_id = token_per_repo_id or {}
-        if format == "opus":
-            import soundfile
-
-            if version.parse(soundfile.__libsndfile_version__) < version.parse("1.0.30"):
-                raise RuntimeError(
-                    "Decoding .opus files requires 'libsndfile'>=1.0.30, "
-                    + "it can be installed via conda: `conda install -c conda-forge libsndfile>=1.0.30`"
-                )
-        source_url = path.split("::")[-1]
-        try:
-            repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
-            use_auth_token = token_per_repo_id[repo_id]
-        except (ValueError, KeyError):
-            use_auth_token = None
-
-        with xopen(path, "rb", use_auth_token=use_auth_token) as f:
-            array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
-        return array, sampling_rate
-
-    def _decode_non_mp3_file_like(self, file, format=None):
-        try:
-            import librosa
-            import soundfile as sf
-        except ImportError as err:
-            raise ImportError("To support decoding audio files, please install 'librosa' and 'soundfile'.") from err
-
-        if format == "opus":
-            if version.parse(sf.__libsndfile_version__) < version.parse("1.0.30"):
-                raise RuntimeError(
-                    "Decoding .opus files requires 'libsndfile'>=1.0.30, "
-                    + 'it can be installed via conda: `conda install -c conda-forge "libsndfile>=1.0.30"`'
-                )
-        array, sampling_rate = sf.read(file)
-        array = array.T
-        if self.mono:
-            array = librosa.to_mono(array)
-        if self.sampling_rate and self.sampling_rate != sampling_rate:
-            array = librosa.resample(array, orig_sr=sampling_rate, target_sr=self.sampling_rate)
-            sampling_rate = self.sampling_rate
-        return array, sampling_rate
-
-    def _decode_mp3(self, path_or_file):
-        try:
-            import torchaudio
-        except ImportError as err:
-            raise ImportError("To support decoding 'mp3' audio files, please install 'torchaudio'.") from err
-        if version.parse(torchaudio.__version__) < version.parse("0.12.0"):
-            try:
-                torchaudio.set_audio_backend("sox_io")
-            except RuntimeError as err:
-                raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
-            array, sampling_rate = self._decode_mp3_torchaudio(path_or_file)
-        else:
-            try:  # try torchaudio anyway because sometimes it works (depending on the os and os packages installed)
-                array, sampling_rate = self._decode_mp3_torchaudio(path_or_file)
-            except RuntimeError:
-                global _ffmpeg_warned
-                if not _ffmpeg_warned:
-                    warnings.warn(
-                        "\nTo support 'mp3' decoding with `torchaudio>=0.12.0`, make sure you have `ffmpeg` system package with at least version 4 installed. "
-                        "Alternatively, you can downgrade `torchaudio`:\n\n"
-                        "\tpip install \"torchaudio<0.12\".\n\nOtherwise 'mp3' files will be decoded with `librosa`."
-                    )
-                    _ffmpeg_warned = True
-                try:
-                    # flake8: noqa
-                    import librosa
-                except ImportError as err:
-                    raise ImportError(
-                        "\nTo support 'mp3' decoding with `torchaudio>=0.12.0`, make sure you have `ffmpeg` system package with at least version 4 installed. "
-                        "\tpip install \"torchaudio<0.12\".\n\nTo decode 'mp3' files without `torchaudio`, please install `librosa`:\n\n"
-                        "\tpip install librosa\n\nNote that decoding might be extremely slow in that case."
-                    ) from err
-                # try to decode with librosa for torchaudio>=0.12.0 as a workaround
-                global _librosa_warned
-                if not _librosa_warned:
-                    warnings.warn("Decoding mp3 with `librosa` instead of `torchaudio`, decoding might be slow.")
-                    _librosa_warned = True
-                try:
-                    array, sampling_rate = self._decode_mp3_librosa(path_or_file)
-                except RuntimeError as err:
-                    raise RuntimeError(
-                        "Decoding of 'mp3' failed, probably because of streaming mode "
-                        "(`librosa` cannot decode 'mp3' file-like objects, only path-like)."
-                    ) from err
-
-        return array, sampling_rate
-
-    def _decode_mp3_torchaudio(self, path_or_file):
-        import torchaudio
-        import torchaudio.transforms as T
-
-        array, sampling_rate = torchaudio.load(path_or_file, format="mp3")
-        if self.sampling_rate and self.sampling_rate != sampling_rate:
-            if not hasattr(self, "_resampler") or self._resampler.orig_freq != sampling_rate:
-                self._resampler = T.Resample(sampling_rate, self.sampling_rate)
-            array = self._resampler(array)
-            sampling_rate = self.sampling_rate
-        array = array.numpy()
-        if self.mono:
-            array = array.mean(axis=0)
-        return array, sampling_rate
-
-    def _decode_mp3_librosa(self, path_or_file):
-        import librosa
-
-        global _audioread_warned
-
-        with warnings.catch_warnings():
-            if _audioread_warned:
-                warnings.filterwarnings("ignore", "pysoundfile failed.+?", UserWarning, module=librosa.__name__)
-            else:
-                _audioread_warned = True
-            array, sampling_rate = librosa.load(path_or_file, mono=self.mono, sr=self.sampling_rate)
-
-        return array, sampling_rate
diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
@@ -640,9 +640,13 @@ def _save_regex(pickler, obj):
 
                     @pklregister(obj_type)
                     def _save_tensor(pickler, obj):
+                        # `torch.from_numpy` is not picklable in `torch>=1.11.0`
+                        def _create_tensor(np_array):
+                            return torch.from_numpy(np_array)
+
                         dill_log(pickler, f"To: {obj}")
                         args = (obj.detach().cpu().numpy(),)
-                        pickler.save_reduce(torch.from_numpy, args, obj=obj)
+                        pickler.save_reduce(_create_tensor, args, obj=obj)
                         dill_log(pickler, "# To")
                         return
 
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
diff --git a/tests/utils.py b/tests/utils.py

Original file line number	Diff line number	Diff line change
`@@ -142,6 +142,7 @@`
`142`	`142`	`]`
`143`	`143`
`144`	`144`	`AUDIO_REQUIRE = [`
	`145`	`+ "soundfile>=0.12.1",`
`145`	`146`	`"librosa",`
`146`	`147`	`]`
`147`	`148`
`@@ -176,8 +177,7 @@`
`176`	`177`	`"tensorflow-macos; sys_platform == 'darwin' and platform_machine == 'arm64'",`
`177`	`178`	`"tiktoken;python_version>='3.8'",`
`178`	`179`	`"torch",`
`179`		`- "torchaudio<0.12.0",`
`180`		`- "soundfile",`
	`180`	`+ "soundfile>=0.12.1",`
`181`	`181`	`"transformers",`
`182`	`182`	`"zstandard",`
`183`	`183`	`]`