Add load_with_torchcodec, modify load()'s warnings (#3974)

NicolasHug · web-flow · commit 800b9dc479b8 · 2025-07-15T14:39:42.000+01:00
diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh
@@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in
     ;;
 esac
 PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}"
-pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}"
+pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}"
 
 
 # 2. Install torchaudio
@@ -86,6 +86,10 @@ python setup.py install
 
 # 3. Install Test tools
 printf "* Installing test tools\n"
+# On this CI, for whatever reason, we're only able to install ffmpeg 4.
+conda install -y "ffmpeg<5"
+python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)"
+
 NUMBA_DEV_CHANNEL=""
 if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
     # Numba isn't available for Python 3.9 and 3.10 except on the numba dev channel and building from source fails
@@ -94,7 +98,7 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
 fi
 (
     set -x
-    conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' 'ffmpeg>=6,<7'
+    conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20'
     pip install kaldi-io SoundFile librosa coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm
 
     # TODO: might be better to fix the single call to `pip install` above
diff --git a/docs/source/torchaudio.rst b/docs/source/torchaudio.rst
@@ -7,9 +7,11 @@ torchaudio
     Starting with version 2.8, we are refactoring TorchAudio to transition it
     into a maintenance phase. As a result:
 
-    - The APIs listed below are deprecated in 2.8 and will be removed in 2.9.
+    - Most APIs listed below are deprecated in 2.8 and will be removed in 2.9.
     - The decoding and encoding capabilities of PyTorch for both audio and video
-      are being consolidated into TorchCodec.
+      are being consolidated into TorchCodec. We provide
+      ``torchaudio.load_with_torchcodec()`` as a replacement for
+      ``torchaudio.load()``.
 
     Please see https://github.com/pytorch/audio/issues/3902 for more information.
 
@@ -26,6 +28,7 @@ it easy to handle audio data.
 
    info
    load
+   load_with_torchcodec
    save
    list_audio_backends
 
diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py
@@ -7,16 +7,16 @@
     get_audio_backend as _get_audio_backend,
     info as _info,
     list_audio_backends as _list_audio_backends,
-    load as _load,
+    load,
     save as _save,
     set_audio_backend as _set_audio_backend,
 )
+from ._torchcodec import load_with_torchcodec
 
 AudioMetaData = dropping_class_io_support(_AudioMetaData)
 get_audio_backend = dropping_io_support(_get_audio_backend)
 info = dropping_io_support(_info)
 list_audio_backends = dropping_io_support(_list_audio_backends)
-load = dropping_io_support(_load)
 save = dropping_io_support(_save)
 set_audio_backend = dropping_io_support(_set_audio_backend)
 
@@ -45,6 +45,7 @@
 __all__ = [
     "AudioMetaData",
     "load",
+    "load_with_torchcodec",
     "info",
     "save",
     "io",
diff --git a/src/torchaudio/_backend/utils.py b/src/torchaudio/_backend/utils.py
@@ -1,6 +1,7 @@
 import os
 from functools import lru_cache
 from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
+import warnings
 
 import torch
 
@@ -127,6 +128,14 @@ def load(
     ) -> Tuple[torch.Tensor, int]:
         """Load audio data from source.
 
+        .. warning::
+            In 2.9, this function's implementation will be changed to use
+            :func:`~torchaudio.load_with_torchcodec` under the hood. Some
+            parameters like ``normalize``, ``format``, ``buffer_size``, and
+            ``backend`` will be ignored. We recommend that you port your code to
+            rely directly on TorchCodec's decoder instead:
+            https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder.
+
         By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
         ``float32`` dtype, and the shape of `[channel, time]`.
 
@@ -201,6 +210,14 @@ def load(
                 integer type, else ``float32`` type. If ``channels_first=True``, it has
                 `[channel, time]` else `[time, channel]`.
         """
+        warnings.warn(
+            "In 2.9, this function's implementation will be changed to use "
+            "`torchaudio.load_with_torchcodec` under the hood. Some "
+            "parameters like ``normalize``, ``format``, ``buffer_size``, and "
+            "``backend`` will be ignored. We recommend that you port your code to "
+            "rely directly on TorchCodec's decoder instead: "
+            "https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder."
+        )
         backend = dispatcher(uri, format, backend)
         return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
 
diff --git a/src/torchaudio/_torchcodec.py b/src/torchaudio/_torchcodec.py
@@ -0,0 +1,160 @@
+"""TorchCodec integration for TorchAudio."""
+
+import os
+import warnings
+from typing import BinaryIO, Optional, Tuple, Union
+
+import torch
+
+
+def load_with_torchcodec(
+    uri: Union[BinaryIO, str, os.PathLike],
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+    buffer_size: int = 4096,
+    backend: Optional[str] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio data from source using TorchCodec's AudioDecoder.
+
+    .. note::
+
+        This function supports the same API as ``torchaudio.load()``, and relies
+        on TorchCodec's decoding capabilities under the hood. It is provided for
+        convenience, but we do recommend that you port your code to natively use
+        ``torchcodec``'s ``AudioDecoder`` class for better performance:
+        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder.
+        In TorchAudio 2.9, ``torchaudio.load()`` will be relying on
+        ``load_with_torchcodec``. Note that some parameters of
+        ``torchaudio.load()``, like ``normalize``, ``format``, ``buffer_size``,
+        and ``backend``, are ignored by ``load_with_torchcodec``.
+
+    Args:
+        uri (path-like object or file-like object):
+            Source of audio data. The following types are accepted:
+
+            * ``path-like``: File path or URL.
+            * ``file-like``: Object with ``read(size: int) -> bytes`` method.
+
+        frame_offset (int, optional):
+            Number of samples to skip before starting to read data. Must be
+            non-negative.
+        num_frames (int, optional):
+            Maximum number of samples to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+        normalize (bool, optional):
+            TorchCodec always returns normalized float32 samples. This parameter
+            is ignored and a warning is issued if set to False.
+            Default: ``True``.
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Not used by TorchCodec AudioDecoder. Provided for API compatibility;
+            a warning is issued if it is not ``None``. (Default: ``None``)
+        buffer_size (int, optional):
+            Not used by TorchCodec AudioDecoder. Provided for API compatibility;
+            a warning is issued if it differs from the default.
+        backend (str or None, optional):
+            Not used by TorchCodec AudioDecoder. Provided for API compatibility;
+            a warning is issued if it is not ``None``.
+
+    Returns:
+        (torch.Tensor, int): Resulting Tensor and sample rate.
+        Always returns float32 tensors. If ``channels_first=True``, shape is
+        `[channel, time]`, otherwise `[time, channel]`.
+
+    Raises:
+        ImportError: If torchcodec is not available.
+        ValueError: If ``frame_offset`` is negative.
+        RuntimeError: If TorchCodec fails to decode the audio, or the sample
+            rate cannot be determined from the metadata.
+
+    Note:
+        - TorchCodec always returns normalized float32 samples, so the ``normalize``
+          parameter has no effect.
+        - The ``format``, ``buffer_size`` and ``backend`` parameters are ignored.
+        - Not all audio formats supported by torchaudio backends may be supported
+          by TorchCodec.
+    """
+    # A negative offset would silently slice from the end of the signal.
+    if frame_offset < 0:
+        raise ValueError(f"frame_offset must be non-negative, got {frame_offset}")
+
+    # Import torchcodec here to provide clear error if not available
+    try:
+        from torchcodec.decoders import AudioDecoder
+    except ImportError as e:
+        raise ImportError(
+            "TorchCodec is required for load_with_torchcodec. "
+            "Please install torchcodec to use this function."
+        ) from e
+
+    # Warn about every torchaudio.load() parameter that TorchCodec ignores.
+    # stacklevel=2 attributes the warning to the caller's line.
+    if not normalize:
+        warnings.warn(
+            "TorchCodec AudioDecoder always returns normalized float32 samples. "
+            "The 'normalize=False' parameter is ignored.",
+            UserWarning,
+            stacklevel=2,
+        )
+    if buffer_size != 4096:
+        warnings.warn(
+            "The 'buffer_size' parameter is not used by TorchCodec AudioDecoder.",
+            UserWarning,
+            stacklevel=2,
+        )
+    if backend is not None:
+        warnings.warn(
+            "The 'backend' parameter is not used by TorchCodec AudioDecoder.",
+            UserWarning,
+            stacklevel=2,
+        )
+    if format is not None:
+        warnings.warn(
+            "The 'format' parameter is not supported by TorchCodec AudioDecoder.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Create AudioDecoder
+    try:
+        decoder = AudioDecoder(uri)
+    except Exception as e:
+        raise RuntimeError(f"Failed to create AudioDecoder for {uri}: {e}") from e
+
+    # Get sample rate from metadata
+    sample_rate = decoder.metadata.sample_rate
+    if sample_rate is None:
+        raise RuntimeError("Unable to determine sample rate from audio metadata")
+
+    # Decode the entire file first, then subsample manually
+    # This is the simplest approach since torchcodec uses time-based indexing
+    try:
+        audio_samples = decoder.get_all_samples()
+    except Exception as e:
+        raise RuntimeError(f"Failed to decode audio samples: {e}") from e
+
+    # TorchCodec returns data in [channel, time] format by default
+    data = audio_samples.data
+    num_channels, total_frames = data.shape[0], data.shape[1]
+
+    # Clamp the requested window [start, stop) to the decoded samples; a
+    # negative num_frames means "everything after frame_offset".
+    start = min(frame_offset, total_frames)
+    stop = total_frames if num_frames < 0 else min(start + num_frames, total_frames)
+    if start >= stop:
+        # Nothing to return: an empty float32 tensor with the right channel
+        # count (offset past the end, or num_frames == 0).
+        data = torch.zeros((num_channels, 0), dtype=torch.float32)
+    else:
+        data = data[:, start:stop]
+
+    # Handle channels_first parameter
+    if not channels_first:
+        data = data.transpose(0, 1)  # [channel, time] -> [time, channel]
+
+    return data, sample_rate
diff --git a/test/torchaudio_unittest/test_load_torchcodec.py b/test/torchaudio_unittest/test_load_torchcodec.py