14 changes: 0 additions & 14 deletions docker/Dockerfile.speech
@chtruong814 could you help review the Dockerfile changes?

@@ -22,8 +22,6 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
FROM ${BASE_IMAGE} as nemo-deps

# dependency flags; should be declared after FROM
# torchaudio: not required by default
ARG REQUIRE_TORCHAUDIO=false
# k2: not required by default
ARG REQUIRE_K2=false
# ais cli: not required by default, install only if required
@@ -96,18 +94,6 @@ WORKDIR /tmp/
# uninstall stuff from base container
RUN pip3 uninstall -y sacrebleu torchtext

# build torchaudio
WORKDIR /tmp/torchaudio_build
COPY scripts/installers /tmp/torchaudio_build/scripts/installers/
RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \
echo ${INSTALL_MSG}; \
if [ ${INSTALL_CODE} -ne 0 ]; then \
echo "torchaudio installation failed"; \
if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \
exit ${INSTALL_CODE}; \
else echo "Skipping failed torchaudio installation"; fi \
else echo "torchaudio installed successfully"; fi

COPY scripts /tmp/nemo/scripts/
# install correct graphviz version (k2 and pynini visualization tool), skip if installation fails
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \
2 changes: 1 addition & 1 deletion examples/audio/process_audio.py
@@ -24,7 +24,7 @@
import torch
from omegaconf import OmegaConf

from nemo.collections.audio.models import AudioToAudioModel
from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel
from nemo.core.config import hydra_runner
from nemo.utils import logging, model_utils

38 changes: 6 additions & 32 deletions nemo/collections/asr/modules/audio_preprocessing.py
@@ -19,11 +19,11 @@
from typing import Any, Optional

import torch
from packaging import version

from nemo.collections.asr.parts.numba.spec_augment import SpecAugmentNumba, spec_augment_launch_heuristics
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures, FilterbankFeaturesTA
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures
from nemo.collections.asr.parts.submodules.spectr_augment import SpecAugment, SpecCutout
from nemo.collections.audio.parts.utils.transforms import MFCC
from nemo.core.classes import Exportable, NeuralModule, typecheck
from nemo.core.neural_types import (
AudioSignal,
@@ -37,18 +37,6 @@
from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__
from nemo.utils import logging, logging_mode

try:
import torchaudio
import torchaudio.functional
import torchaudio.transforms

TORCHAUDIO_VERSION = version.parse(torchaudio.__version__)
TORCHAUDIO_VERSION_MIN = version.parse('0.5')

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False

__all__ = [
'AudioToMelSpectrogramPreprocessor',
'AudioToMFCCPreprocessor',
@@ -171,7 +159,6 @@ class AudioToMelSpectrogramPreprocessor(AudioPreprocessor, Exportable):
Defaults to 0.0
nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation.
Defaults to 4000
use_torchaudio: Whether to use the `torchaudio` implementation.
mel_norm: Normalization used for mel filterbank weights.
Defaults to 'slaney' (area normalization)
stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints.
@@ -237,8 +224,8 @@ def __init__(
rng=None,
nb_augmentation_prob=0.0,
nb_max_freq=4000,
use_torchaudio: bool = False,
mel_norm="slaney",
use_torchaudio: bool = False, # Deprecated arguments; kept for config compatibility
stft_exact_pad=False, # Deprecated arguments; kept for config compatibility
stft_conv=False, # Deprecated arguments; kept for config compatibility
):
@@ -256,11 +243,7 @@ def __init__(
super().__init__(n_window_size, n_window_stride)

# Given the long and similar argument list, point to the class and instantiate it by reference
if not use_torchaudio:
featurizer_class = FilterbankFeatures
else:
featurizer_class = FilterbankFeaturesTA
self.featurizer = featurizer_class(
self.featurizer = FilterbankFeatures(
sample_rate=self._sample_rate,
n_window_size=n_window_size,
n_window_stride=n_window_stride,
@@ -306,7 +289,6 @@ def filter_banks(self):

class AudioToMFCCPreprocessor(AudioPreprocessor):
"""Preprocessor that converts wavs to MFCCs.
Uses torchaudio.transforms.MFCC.

Args:
sample_rate: The sample rate of the audio.
@@ -382,14 +364,6 @@ def __init__(
log=True,
):
self._sample_rate = sample_rate
if not HAVE_TORCHAUDIO:
logging.error('Could not import torchaudio. Some features might not work.')

raise ModuleNotFoundError(
"torchaudio is not installed but is necessary for "
"AudioToMFCCPreprocessor. We recommend you try "
"building it from source for the PyTorch version you have."
)
if window_size and n_window_size:
raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
if window_stride and n_window_stride:
@@ -425,7 +399,7 @@ def __init__(
mel_kwargs['window_fn'] = window_fn

# Use torchaudio's implementation of MFCCs as featurizer
self.featurizer = torchaudio.transforms.MFCC(
self.featurizer = MFCC(
sample_rate=self._sample_rate,
n_mfcc=n_mfcc,
dct_type=dct_type,
@@ -746,8 +720,8 @@ class AudioToMelSpectrogramPreprocessorConfig:
rng: Optional[str] = None
nb_augmentation_prob: float = 0.0
nb_max_freq: int = 4000
use_torchaudio: bool = False
mel_norm: str = "slaney"
use_torchaudio: bool = False # Deprecated argument, kept for compatibility with older checkpoints.
stft_exact_pad: bool = False # Deprecated argument, kept for compatibility with older checkpoints.
stft_conv: bool = False # Deprecated argument, kept for compatibility with older checkpoints.

193 changes: 0 additions & 193 deletions nemo/collections/asr/parts/preprocessing/features.py
@@ -34,7 +34,6 @@
# This file contains code artifacts adapted from https://github.com/ryanleary/patter
import math
import random
from typing import Optional, Tuple, Union

import librosa
import numpy as np
@@ -45,14 +44,6 @@
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
from nemo.utils import logging

try:
import torchaudio

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False


CONSTANT = 1e-5


@@ -499,187 +490,3 @@ def forward(self, x, seq_len, linear_spec=False):
if pad_amt != 0:
x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
return x, seq_len


class FilterbankFeaturesTA(nn.Module):
"""
Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.

See `AudioToMelSpectrogramPreprocessor` for args.

"""

def __init__(
self,
sample_rate: int = 16000,
n_window_size: int = 320,
n_window_stride: int = 160,
normalize: Optional[str] = "per_feature",
nfilt: int = 64,
n_fft: Optional[int] = None,
preemph: float = 0.97,
lowfreq: float = 0,
highfreq: Optional[float] = None,
log: bool = True,
log_zero_guard_type: str = "add",
log_zero_guard_value: Union[float, str] = 2**-24,
dither: float = 1e-5,
window: str = "hann",
pad_to: int = 0,
pad_value: float = 0.0,
mel_norm="slaney",
# Seems like no one uses these options anymore. Don't convolute the code by supporting thm.
use_grads: bool = False, # Deprecated arguments; kept for config compatibility
max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility
frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility
exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility
nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility
mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility
rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility
stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
stft_conv: bool = False, # Deprecated arguments; kept for config compatibility
):
super().__init__()
if not HAVE_TORCHAUDIO:
raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}")

# Make sure log zero guard is supported, if given as a string
supported_log_zero_guard_strings = {"eps", "tiny"}
if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings:
raise ValueError(
f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}"
)

# Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class
self.torch_windows = {
'hann': torch.hann_window,
'hamming': torch.hamming_window,
'blackman': torch.blackman_window,
'bartlett': torch.bartlett_window,
'ones': torch.ones,
None: torch.ones,
}

# Ensure we can look up the window function
if window not in self.torch_windows:
raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}")

self.win_length = n_window_size
self.hop_length = n_window_stride
self._sample_rate = sample_rate
self._normalize_strategy = normalize
self._use_log = log
self._preemphasis_value = preemph
self.log_zero_guard_type = log_zero_guard_type
self.log_zero_guard_value: Union[str, float] = log_zero_guard_value
self.dither = dither
self.pad_to = pad_to
self.pad_value = pad_value
self.n_fft = n_fft
self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram(
sample_rate=self._sample_rate,
win_length=self.win_length,
hop_length=self.hop_length,
n_mels=nfilt,
window_fn=self.torch_windows[window],
mel_scale="slaney",
norm=mel_norm,
n_fft=n_fft,
f_max=highfreq,
f_min=lowfreq,
wkwargs={"periodic": False},
)

@property
def filter_banks(self):
"""Matches the analogous class"""
return self._mel_spec_extractor.mel_scale.fb

def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float:
if isinstance(self.log_zero_guard_value, float):
return self.log_zero_guard_value
return getattr(torch.finfo(dtype), self.log_zero_guard_value)

def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor:
if self.training and self.dither > 0.0:
noise = torch.randn_like(signals) * self.dither
signals = signals + noise
return signals

def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor:
if self._preemphasis_value is not None:
padded = torch.nn.functional.pad(signals, (1, 0))
signals = signals - self._preemphasis_value * padded[:, :-1]
return signals

def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor:
out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long()
return out_lengths

def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor:
# Only apply during training; else need to capture dynamic shape for exported models
if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0:
return features
pad_length = self.pad_to - (features.shape[-1] % self.pad_to)
return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value)

def _apply_log(self, features: torch.Tensor) -> torch.Tensor:
if self._use_log:
zero_guard = self._resolve_log_zero_guard_value(features.dtype)
if self.log_zero_guard_type == "add":
features = features + zero_guard
elif self.log_zero_guard_type == "clamp":
features = features.clamp(min=zero_guard)
else:
raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'")
features = features.log()
return features

def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor:
# Complex FFT needs to be done in single precision
with torch.amp.autocast('cuda', enabled=False):
features = self._mel_spec_extractor(waveform=signals)
return features

def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
# For consistency, this function always does a masked fill even if not normalizing.
mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False)
features = features.masked_fill(mask, 0.0)
# Maybe don't normalize
if self._normalize_strategy is None:
return features
# Use the log zero guard for the sqrt zero guard
guard_value = self._resolve_log_zero_guard_value(features.dtype)
if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features":
# 'all_features' reduces over each sample; 'per_feature' reduces over each channel
reduce_dim = 2
if self._normalize_strategy == "all_features":
reduce_dim = [1, 2]
# [B, D, T] -> [B, D, 1] or [B, 1, 1]
means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1))
stds = (
features.sub(means)
.masked_fill(mask, 0.0)
.pow(2.0)
.sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1]
.div(lengths.view(-1, 1, 1) - 1) # assume biased estimator
.clamp(min=guard_value) # avoid sqrt(0)
.sqrt()
)
features = (features - means) / (stds + eps)
else:
# Deprecating constant std/mean
raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}")
features = features.masked_fill(mask, 0.0)
return features

def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
feature_lengths = self._compute_output_lengths(input_lengths=length)
signals = self._apply_dithering(signals=input_signal)
signals = self._apply_preemphasis(signals=signals)
features = self._extract_spectrograms(signals=signals)
features = self._apply_log(features=features)
features = self._apply_normalization(features=features, lengths=feature_lengths)
features = self._apply_pad_to(features=features)
return features, feature_lengths
2 changes: 0 additions & 2 deletions nemo/collections/audio/losses/__init__.py
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.audio.losses.audio import MAELoss, MSELoss, SDRLoss
17 changes: 2 additions & 15 deletions nemo/collections/audio/losses/maxine/losses_combined.py
@@ -18,15 +18,8 @@

import torch

try:
from torchaudio.functional import resample
from torchaudio.transforms import MelSpectrogram

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False

from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.models.asr_model import ASRModel
from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, resample
from nemo.core import Loss, Typing, typecheck
from nemo.core.neural_types import LengthsType, LossType, NeuralType, VoidType
from nemo.utils import logging
@@ -94,12 +87,6 @@ def __init__(
conformer_model=STT_EN_CONFORMER_CTC_SMALL_v1_6_0,
epsilon=float(5.9604644775390625e-8),
):
if not HAVE_TORCHAUDIO:
logging.error('Could not import torchaudio. Some features might not work.')

raise ModuleNotFoundError(
f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}"
)

super().__init__()
self.sample_rate = sample_rate