fix(audio): migrate from deprecated torchaudio (#1406)

shcheklein · web-flow · commit b2d9fccc2678 · 2025-10-15T20:07:04.000-07:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -73,7 +73,6 @@ torch = [
   "transformers>=4.36.0"
 ]
 audio = [
-  "torchaudio",
   "soundfile"
 ]
 remote = [
@@ -88,7 +87,11 @@ hf = [
   "datasets[vision]>=4.0.0",
   # https://github.com/pytorch/torchcodec/issues/640
   "datasets[audio]>=4.0.0 ; (sys_platform == 'linux' or sys_platform == 'darwin')",
-  "fsspec>=2024.12.0"
+  "fsspec>=2024.12.0",
+  # Pin until datasets fixes the issue below; run the test_hf_audio test to check if it can be removed
+  # https://github.com/meta-pytorch/torchcodec/issues/912
+  # https://github.com/huggingface/transformers/pull/41610
+  "torch<2.9.0"
 ]
 video = [
   "ffmpeg-python",
@@ -134,7 +137,9 @@ examples = [
   "huggingface_hub[hf_transfer]",
   "ultralytics",
   "open_clip_torch",
-  "openai"
+  "openai",
+  # torchaudio is still required by transformers
+  "torchaudio<2.9.0"
 ]
 
 [project.urls]
diff --git a/src/datachain/lib/audio.py b/src/datachain/lib/audio.py
@@ -1,4 +1,5 @@
 import posixpath
+import re
 from typing import TYPE_CHECKING
 
 from datachain.lib.file import FileError
@@ -9,7 +10,7 @@
     from datachain.lib.file import Audio, AudioFile, File
 
 try:
-    import torchaudio
+    import soundfile as sf
 except ImportError as exc:
     raise ImportError(
         "Missing dependencies for processing audio.\n"
@@ -26,18 +27,25 @@ def audio_info(file: "File | AudioFile") -> "Audio":
 
     try:
         with file.open() as f:
-            info = torchaudio.info(f)
+            info = sf.info(f)
+
+            sample_rate = int(info.samplerate)
+            channels = int(info.channels)
+            frames = int(info.frames)
+            duration = float(info.duration)
 
-            sample_rate = int(info.sample_rate)
-            channels = int(info.num_channels)
-            frames = int(info.num_frames)
-            duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
+            # soundfile provides format and subtype
+            if info.format:
+                format_name = info.format.lower()
+            else:
+                format_name = file.get_file_ext().lower()
 
-            codec_name = getattr(info, "encoding", "")
-            file_ext = file.get_file_ext().lower()
-            format_name = _encoding_to_format(codec_name, file_ext)
+            if not format_name:
+                format_name = "unknown"
+            codec_name = info.subtype if info.subtype else ""
 
-            bits_per_sample = getattr(info, "bits_per_sample", 0)
+            # Derive bits per sample from the subtype; it feeds the bit rate below
+            bits_per_sample = _get_bits_per_sample(info.subtype)
             bit_rate = (
                 bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
             )
@@ -58,44 +66,39 @@ def audio_info(file: "File | AudioFile") -> "Audio":
     )
 
 
-def _encoding_to_format(encoding: str, file_ext: str) -> str:
+def _get_bits_per_sample(subtype: str) -> int:
     """
-    Map torchaudio encoding to a format name.
+    Map soundfile subtype to bits per sample.
 
     Args:
-        encoding: The encoding string from torchaudio.info()
-        file_ext: The file extension as a fallback
+        subtype: The subtype string from soundfile
 
     Returns:
-        Format name as a string
+        Bits per sample, or 0 if unknown
     """
-    # Direct mapping for formats that match exactly
-    encoding_map = {
-        "FLAC": "flac",
-        "MP3": "mp3",
-        "VORBIS": "ogg",
-        "AMR_WB": "amr",
-        "AMR_NB": "amr",
-        "OPUS": "opus",
-        "GSM": "gsm",
+    if not subtype:
+        return 0
+
+    # Common PCM and floating-point subtypes
+    pcm_bits = {
+        "PCM_16": 16,
+        "PCM_24": 24,
+        "PCM_32": 32,
+        "PCM_S8": 8,
+        "PCM_U8": 8,
+        "FLOAT": 32,
+        "DOUBLE": 64,
     }
 
-    if encoding in encoding_map:
-        return encoding_map[encoding]
+    if subtype in pcm_bits:
+        return pcm_bits[subtype]
 
-    # For PCM variants, use file extension to determine format
-    if encoding.startswith("PCM_"):
-        # Common PCM formats by extension
-        pcm_formats = {
-            "wav": "wav",
-            "aiff": "aiff",
-            "au": "au",
-            "raw": "raw",
-        }
-        return pcm_formats.get(file_ext, "wav")  # Default to wav for PCM
+    # Handle variants such as PCM_S16LE, PCM_F32LE, etc.
+    match = re.search(r"PCM_(?:[A-Z]*?)(\d+)", subtype)
+    if match:
+        return int(match.group(1))
 
-    # Fallback to file extension if encoding is unknown
-    return file_ext if file_ext else "unknown"
+    return 0
 
 
 def audio_to_np(
@@ -114,27 +117,27 @@ def audio_to_np(
 
     try:
         with audio.open() as f:
-            info = torchaudio.info(f)
-            sample_rate = info.sample_rate
+            info = sf.info(f)
+            sample_rate = info.samplerate
 
             frame_offset = int(start * sample_rate)
             num_frames = int(duration * sample_rate) if duration is not None else -1
 
             # Reset file pointer to the beginning
-            # This is important to ensure we read from the correct position later
             f.seek(0)
 
-            waveform, sr = torchaudio.load(
-                f, frame_offset=frame_offset, num_frames=num_frames
+            # Read audio data with offset and frame count
+            audio_np, sr = sf.read(
+                f,
+                start=frame_offset,
+                frames=num_frames,
+                always_2d=False,
+                dtype="float32",
             )
 
-            audio_np = waveform.numpy()
-
-            if audio_np.shape[0] > 1:
-                audio_np = audio_np.T
-            else:
-                audio_np = audio_np.squeeze()
-
+            # soundfile returns shape (frames,) for mono or
+            # (frames, channels) for multi-channel
+            # We keep this format as it matches expected output
             return audio_np, int(sr)
     except Exception as exc:
         raise FileError(
@@ -152,11 +155,9 @@ def audio_to_bytes(
 
     If duration is None, converts from start to end of file.
     If start is 0 and duration is None, converts entire file."""
-    y, sr = audio_to_np(audio, start, duration)
-
     import io
 
-    import soundfile as sf
+    y, sr = audio_to_np(audio, start, duration)
 
     buffer = io.BytesIO()
     sf.write(buffer, y, sr, format=format)
diff --git a/tests/unit/lib/test_audio.py b/tests/unit/lib/test_audio.py
@@ -274,9 +274,7 @@ def test_save_audio_auto_format(tmp_path, catalog):
 
 def test_audio_info_file_error(audio_file):
     """Test audio_info handles file errors properly."""
-    with patch(
-        "datachain.lib.audio.torchaudio.info", side_effect=Exception("Test error")
-    ):
+    with patch("datachain.lib.audio.sf.info", side_effect=Exception("Test error")):
         with pytest.raises(
             FileError, match="unable to extract metadata from audio file"
         ):
@@ -285,9 +283,7 @@ def test_audio_info_file_error(audio_file):
 
 def test_audio_fragment_np_file_error(audio_file):
     """Test audio_fragment_np handles file errors properly."""
-    with patch(
-        "datachain.lib.audio.torchaudio.info", side_effect=Exception("Test error")
-    ):
+    with patch("datachain.lib.audio.sf.info", side_effect=Exception("Test error")):
         with pytest.raises(FileError, match="unable to read audio fragment"):
             audio_to_np(audio_file)
 
@@ -322,34 +318,30 @@ def test_audio_to_bytes_formats(audio_file, format_type):
 
 
 @pytest.mark.parametrize(
-    "encoding,file_ext,expected_format",
+    "format_str,subtype,file_ext,expected_format,expected_bit_rate",
     [
-        # Test direct encoding mappings
-        ("FLAC", "flac", "flac"),
-        ("MP3", "mp3", "mp3"),
-        ("VORBIS", "ogg", "ogg"),
-        ("OPUS", "opus", "opus"),
-        ("AMR_WB", "amr", "amr"),
-        ("AMR_NB", "amr", "amr"),
-        ("GSM", "gsm", "gsm"),
-        # Test PCM variants with different extensions
-        ("PCM_S16LE", "wav", "wav"),
-        ("PCM_S24LE", "aiff", "aiff"),
-        ("PCM_F32LE", "au", "au"),
-        ("PCM_U8", "raw", "raw"),
-        ("PCM_S16BE", "unknown_ext", "wav"),  # Default for PCM
-        # Test unknown encoding falls back to file extension
-        ("UNKNOWN_CODEC", "mp3", "mp3"),
-        ("UNKNOWN_CODEC", "flac", "flac"),
-        # Test files without extension
-        ("UNKNOWN_CODEC", "", "unknown"),
-        ("", "", "unknown"),
+        # Direct format mappings from soundfile
+        ("WAV", "PCM_16", "wav", "wav", 16 * 16000),
+        ("FLAC", "PCM_16", "flac", "flac", 16 * 16000),
+        ("OGG", "VORBIS", "ogg", "ogg", -1),
+        ("AIFF", "PCM_24", "aiff", "aiff", 24 * 16000),
+        # Format fallback to file extension when subtype is PCM
+        (None, "PCM_16", "wav", "wav", 16 * 16000),
+        (None, "PCM_24", "aiff", "aiff", 24 * 16000),
+        (None, "PCM_S16LE", "au", "au", 16 * 16000),
+        (None, "PCM_F32LE", "wav", "wav", 32 * 16000),
+        # Unknown format with extension falls back to extension
+        (None, "UNKNOWN_CODEC", "mp3", "mp3", -1),
+        ("", "UNKNOWN_CODEC", "flac", "flac", -1),
+        # Files without extension should fall back to "unknown"
+        (None, "PCM_16", "", "unknown", 16 * 16000),
+        ("", "UNKNOWN_CODEC", "", "unknown", -1),
     ],
 )
 def test_audio_info_format_detection(
-    tmp_path, catalog, encoding, file_ext, expected_format
+    tmp_path, catalog, format_str, subtype, file_ext, expected_format, expected_bit_rate
 ):
-    """Test audio format detection for different file extensions and encodings."""
+    """Test audio format detection for different file extensions and formats."""
     # Create a test audio file with the specified extension
     filename = f"test_audio.{file_ext}" if file_ext else "test_audio"
     audio_data = generate_test_wav(duration=0.1, sample_rate=16000)
@@ -359,18 +351,20 @@ def test_audio_info_format_detection(
     audio_file = AudioFile(path=str(audio_path), source="file://")
     audio_file._set_stream(catalog, caching_enabled=False)
 
-    # Mock torchaudio.info to return controlled encoding
-    with patch("datachain.lib.audio.torchaudio.info") as mock_info:
-        mock_info.return_value.sample_rate = 16000
-        mock_info.return_value.num_channels = 1
-        mock_info.return_value.num_frames = 1600  # 0.1 seconds
-        mock_info.return_value.encoding = encoding
-        mock_info.return_value.bits_per_sample = 16
+    # Mock soundfile.info to return controlled format
+    with patch("datachain.lib.audio.sf.info") as mock_info:
+        mock_info.return_value.samplerate = 16000
+        mock_info.return_value.channels = 1
+        mock_info.return_value.frames = 1600  # 0.1 seconds
+        mock_info.return_value.duration = 0.1
+        mock_info.return_value.format = format_str
+        mock_info.return_value.subtype = subtype
 
         result = audio_info(audio_file)
 
         assert result.format == expected_format
-        assert result.codec == encoding
+        assert result.codec == subtype
+        assert result.bit_rate == expected_bit_rate
 
 
 def test_audio_info_stereo(stereo_audio_file):