diff --git a/docker/Dockerfile.speech b/docker/Dockerfile.speech index 9fea05036868..65e55987dc44 100644 --- a/docker/Dockerfile.speech +++ b/docker/Dockerfile.speech @@ -22,8 +22,6 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 FROM ${BASE_IMAGE} as nemo-deps # dependency flags; should be declared after FROM -# torchaudio: not required by default -ARG REQUIRE_TORCHAUDIO=false # k2: not required by default ARG REQUIRE_K2=false # ais cli: not required by default, install only if required @@ -96,18 +94,6 @@ WORKDIR /tmp/ # uninstall stuff from base container RUN pip3 uninstall -y sacrebleu torchtext -# build torchaudio -WORKDIR /tmp/torchaudio_build -COPY scripts/installers /tmp/torchaudio_build/scripts/installers/ -RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \ - echo ${INSTALL_MSG}; \ - if [ ${INSTALL_CODE} -ne 0 ]; then \ - echo "torchaudio installation failed"; \ - if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \ - exit ${INSTALL_CODE}; \ - else echo "Skipping failed torchaudio installation"; fi \ - else echo "torchaudio installed successfully"; fi - COPY scripts /tmp/nemo/scripts/ # install correct graphviz version (k2 and pynini visualization tool), skip if installation fails RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \ diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index d34461937284..d77a2644e76c 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -24,7 +24,7 @@ import torch from omegaconf import OmegaConf -from nemo.collections.audio.models import AudioToAudioModel +from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel from nemo.core.config import hydra_runner from nemo.utils import logging, model_utils diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index 085fd0e63183..338ca6cd9a67 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -19,11 +19,11 @@ from typing import Any, Optional import torch -from packaging import version from nemo.collections.asr.parts.numba.spec_augment import SpecAugmentNumba, spec_augment_launch_heuristics -from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures, FilterbankFeaturesTA +from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures from nemo.collections.asr.parts.submodules.spectr_augment import SpecAugment, SpecCutout +from nemo.collections.audio.parts.utils.transforms import MFCC from nemo.core.classes import Exportable, NeuralModule, typecheck from nemo.core.neural_types import ( AudioSignal, @@ -37,18 +37,6 @@ from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils import logging, logging_mode -try: - import torchaudio - import torchaudio.functional - import torchaudio.transforms - - TORCHAUDIO_VERSION = version.parse(torchaudio.__version__) - TORCHAUDIO_VERSION_MIN = version.parse('0.5') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - __all__ = [ 'AudioToMelSpectrogramPreprocessor', 'AudioToMFCCPreprocessor', @@ -171,7 +159,6 @@ class AudioToMelSpectrogramPreprocessor(AudioPreprocessor, Exportable): Defaults to 0.0 nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation. 
Defaults to 4000 - use_torchaudio: Whether to use the `torchaudio` implementation. mel_norm: Normalization used for mel filterbank weights. Defaults to 'slaney' (area normalization) stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints. @@ -237,8 +224,8 @@ def __init__( rng=None, nb_augmentation_prob=0.0, nb_max_freq=4000, - use_torchaudio: bool = False, mel_norm="slaney", + use_torchaudio: bool = False, # Deprecated arguments; kept for config compatibility stft_exact_pad=False, # Deprecated arguments; kept for config compatibility stft_conv=False, # Deprecated arguments; kept for config compatibility ): @@ -256,11 +243,7 @@ def __init__( super().__init__(n_window_size, n_window_stride) # Given the long and similar argument list, point to the class and instantiate it by reference - if not use_torchaudio: - featurizer_class = FilterbankFeatures - else: - featurizer_class = FilterbankFeaturesTA - self.featurizer = featurizer_class( + self.featurizer = FilterbankFeatures( sample_rate=self._sample_rate, n_window_size=n_window_size, n_window_stride=n_window_stride, @@ -306,7 +289,6 @@ def filter_banks(self): class AudioToMFCCPreprocessor(AudioPreprocessor): """Preprocessor that converts wavs to MFCCs. - Uses torchaudio.transforms.MFCC. Args: sample_rate: The sample rate of the audio. @@ -382,14 +364,6 @@ def __init__( log=True, ): self._sample_rate = sample_rate - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. Some features might not work.') - - raise ModuleNotFoundError( - "torchaudio is not installed but is necessary for " - "AudioToMFCCPreprocessor. We recommend you try " - "building it from source for the PyTorch version you have." - ) if window_size and n_window_size: raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.") if window_stride and n_window_stride: @@ -425,7 +399,7 @@ def __init__( mel_kwargs['window_fn'] = window_fn # Use torchaudio's implementation of MFCCs as featurizer - self.featurizer = torchaudio.transforms.MFCC( + self.featurizer = MFCC( sample_rate=self._sample_rate, n_mfcc=n_mfcc, dct_type=dct_type, @@ -746,8 +720,8 @@ class AudioToMelSpectrogramPreprocessorConfig: rng: Optional[str] = None nb_augmentation_prob: float = 0.0 nb_max_freq: int = 4000 - use_torchaudio: bool = False mel_norm: str = "slaney" + use_torchaudio: bool = False # Deprecated argument, kept for compatibility with older checkpoints. stft_exact_pad: bool = False # Deprecated argument, kept for compatibility with older checkpoints. stft_conv: bool = False # Deprecated argument, kept for compatibility with older checkpoints. 
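Note: with the change above, AudioToMFCCPreprocessor is backed by the vendored MFCC transform added later in this patch (nemo.collections.audio.parts.utils.transforms) instead of torchaudio.transforms.MFCC. A minimal usage sketch follows; the input shape and mel-spectrogram settings are illustrative assumptions, and the arguments mirror roughly what the preprocessor forwards to its featurizer.

    import torch

    from nemo.collections.audio.parts.utils.transforms import MFCC

    # Batch of two 1-second waveforms at 16 kHz (illustrative random input).
    waveform = torch.randn(2, 16000)

    # Vendored drop-in for torchaudio.transforms.MFCC; settings below are examples only.
    mfcc = MFCC(
        sample_rate=16000,
        n_mfcc=13,
        dct_type=2,
        norm="ortho",
        log_mels=False,
        melkwargs={"n_fft": 512, "hop_length": 160, "n_mels": 64},
    )

    features = mfcc(waveform)  # -> (batch, n_mfcc, frames), here (2, 13, 101)
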
diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index cffc94d276e3..ec0fa8f6f74d 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -34,7 +34,6 @@ # This file contains code artifacts adapted from https://github.com/ryanleary/patter import math import random -from typing import Optional, Tuple, Union import librosa import numpy as np @@ -45,14 +44,6 @@ from nemo.collections.asr.parts.preprocessing.segment import AudioSegment from nemo.utils import logging -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - - CONSTANT = 1e-5 @@ -499,187 +490,3 @@ def forward(self, x, seq_len, linear_spec=False): if pad_amt != 0: x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value) return x, seq_len - - -class FilterbankFeaturesTA(nn.Module): - """ - Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction. - - See `AudioToMelSpectrogramPreprocessor` for args. - - """ - - def __init__( - self, - sample_rate: int = 16000, - n_window_size: int = 320, - n_window_stride: int = 160, - normalize: Optional[str] = "per_feature", - nfilt: int = 64, - n_fft: Optional[int] = None, - preemph: float = 0.97, - lowfreq: float = 0, - highfreq: Optional[float] = None, - log: bool = True, - log_zero_guard_type: str = "add", - log_zero_guard_value: Union[float, str] = 2**-24, - dither: float = 1e-5, - window: str = "hann", - pad_to: int = 0, - pad_value: float = 0.0, - mel_norm="slaney", - # Seems like no one uses these options anymore. Don't convolute the code by supporting thm. - use_grads: bool = False, # Deprecated arguments; kept for config compatibility - max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility - frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility - exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility - nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility - mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility - rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility - stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - stft_conv: bool = False, # Deprecated arguments; kept for config compatibility - ): - super().__init__() - if not HAVE_TORCHAUDIO: - raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}") - - # Make sure log zero guard is supported, if given as a string - supported_log_zero_guard_strings = {"eps", "tiny"} - if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings: - raise ValueError( - f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}" - ) - - # Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class - self.torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'ones': torch.ones, - None: torch.ones, - } - - # Ensure we can look up the window function - if window not in self.torch_windows: - raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}") - - 
self.win_length = n_window_size - self.hop_length = n_window_stride - self._sample_rate = sample_rate - self._normalize_strategy = normalize - self._use_log = log - self._preemphasis_value = preemph - self.log_zero_guard_type = log_zero_guard_type - self.log_zero_guard_value: Union[str, float] = log_zero_guard_value - self.dither = dither - self.pad_to = pad_to - self.pad_value = pad_value - self.n_fft = n_fft - self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram( - sample_rate=self._sample_rate, - win_length=self.win_length, - hop_length=self.hop_length, - n_mels=nfilt, - window_fn=self.torch_windows[window], - mel_scale="slaney", - norm=mel_norm, - n_fft=n_fft, - f_max=highfreq, - f_min=lowfreq, - wkwargs={"periodic": False}, - ) - - @property - def filter_banks(self): - """Matches the analogous class""" - return self._mel_spec_extractor.mel_scale.fb - - def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float: - if isinstance(self.log_zero_guard_value, float): - return self.log_zero_guard_value - return getattr(torch.finfo(dtype), self.log_zero_guard_value) - - def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor: - if self.training and self.dither > 0.0: - noise = torch.randn_like(signals) * self.dither - signals = signals + noise - return signals - - def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor: - if self._preemphasis_value is not None: - padded = torch.nn.functional.pad(signals, (1, 0)) - signals = signals - self._preemphasis_value * padded[:, :-1] - return signals - - def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor: - out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long() - return out_lengths - - def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor: - # Only apply during training; else need to capture dynamic shape for exported models - if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0: - return features - pad_length = self.pad_to - (features.shape[-1] % self.pad_to) - return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value) - - def _apply_log(self, features: torch.Tensor) -> torch.Tensor: - if self._use_log: - zero_guard = self._resolve_log_zero_guard_value(features.dtype) - if self.log_zero_guard_type == "add": - features = features + zero_guard - elif self.log_zero_guard_type == "clamp": - features = features.clamp(min=zero_guard) - else: - raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'") - features = features.log() - return features - - def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor: - # Complex FFT needs to be done in single precision - with torch.amp.autocast('cuda', enabled=False): - features = self._mel_spec_extractor(waveform=signals) - return features - - def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor: - # For consistency, this function always does a masked fill even if not normalizing. 
- mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False) - features = features.masked_fill(mask, 0.0) - # Maybe don't normalize - if self._normalize_strategy is None: - return features - # Use the log zero guard for the sqrt zero guard - guard_value = self._resolve_log_zero_guard_value(features.dtype) - if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features": - # 'all_features' reduces over each sample; 'per_feature' reduces over each channel - reduce_dim = 2 - if self._normalize_strategy == "all_features": - reduce_dim = [1, 2] - # [B, D, T] -> [B, D, 1] or [B, 1, 1] - means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1)) - stds = ( - features.sub(means) - .masked_fill(mask, 0.0) - .pow(2.0) - .sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1] - .div(lengths.view(-1, 1, 1) - 1) # assume biased estimator - .clamp(min=guard_value) # avoid sqrt(0) - .sqrt() - ) - features = (features - means) / (stds + eps) - else: - # Deprecating constant std/mean - raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}") - features = features.masked_fill(mask, 0.0) - return features - - def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - feature_lengths = self._compute_output_lengths(input_lengths=length) - signals = self._apply_dithering(signals=input_signal) - signals = self._apply_preemphasis(signals=signals) - features = self._extract_spectrograms(signals=signals) - features = self._apply_log(features=features) - features = self._apply_normalization(features=features, lengths=feature_lengths) - features = self._apply_pad_to(features=features) - return features, feature_lengths diff --git a/nemo/collections/audio/losses/__init__.py b/nemo/collections/audio/losses/__init__.py index 00db9e62bc33..341a77c5bc66 100644 --- a/nemo/collections/audio/losses/__init__.py +++ b/nemo/collections/audio/losses/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo.collections.audio.losses.audio import MAELoss, MSELoss, SDRLoss diff --git a/nemo/collections/audio/losses/maxine/losses_combined.py b/nemo/collections/audio/losses/maxine/losses_combined.py index 1ea46aaaca12..95d95fe64ae8 100644 --- a/nemo/collections/audio/losses/maxine/losses_combined.py +++ b/nemo/collections/audio/losses/maxine/losses_combined.py @@ -18,15 +18,8 @@ import torch -try: - from torchaudio.functional import resample - from torchaudio.transforms import MelSpectrogram - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - -from nemo.collections.asr.models import ASRModel +from nemo.collections.asr.models.asr_model import ASRModel +from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, resample from nemo.core import Loss, Typing, typecheck from nemo.core.neural_types import LengthsType, LossType, NeuralType, VoidType from nemo.utils import logging @@ -94,12 +87,6 @@ def __init__( conformer_model=STT_EN_CONFORMER_CTC_SMALL_v1_6_0, epsilon=float(5.9604644775390625e-8), ): - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. 
Some features might not work.') - - raise ModuleNotFoundError( - f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}" - ) super().__init__() self.sample_rate = sample_rate diff --git a/nemo/collections/audio/models/__init__.py b/nemo/collections/audio/models/__init__.py index 4e743cd8c82b..341a77c5bc66 100644 --- a/nemo/collections/audio/models/__init__.py +++ b/nemo/collections/audio/models/__init__.py @@ -11,12 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel -from nemo.collections.audio.models.enhancement import ( - EncMaskDecAudioToAudioModel, - FlowMatchingAudioToAudioModel, - PredictiveAudioToAudioModel, - SchroedingerBridgeAudioToAudioModel, - ScoreBasedGenerativeAudioToAudioModel, -) diff --git a/nemo/collections/audio/parts/utils/resampling.py b/nemo/collections/audio/parts/utils/resampling.py deleted file mode 100644 index 8b82ccf04a72..000000000000 --- a/nemo/collections/audio/parts/utils/resampling.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# NOTE: The code below originates from torchaudio repository, version 2.6.0. -# It can be found under: https://github.com/pytorch/audio/tree/release/2.6 -# The modifications applied are mostly cosmetic. -# The inclusion of this code in NeMo allows us to avoid -# a dependency with a problematic build process. -# This code is licensed under the BSD 2-Clause License, -# included verbatim from the torchaudio repository below: -# -# BSD 2-Clause License -# -# Copyright (c) 2017 Facebook Inc. (Soumith Chintala), -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math -from typing import Optional - -import torch - - -class Resample(torch.nn.Module): - r"""Resample a signal from one frequency to another. A resampling method can be given. - - .. devices:: CPU CUDA - - .. properties:: Autograd TorchScript - - Note: - If resampling on waveforms of higher precision than float32, there may be a small loss of precision - because the kernel is cached once as float32. If high precision resampling is important for your application, - the functional form will retain higher precision, but run slower because it does not cache the kernel. - Alternatively, you could rewrite a transform that caches a higher precision kernel. - - Args: - orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) - new_freq (int, optional): The desired frequency. (Default: ``16000``) - resampling_method (str, optional): The resampling method to use. - Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``) - lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper - but less efficient. (Default: ``6``) - rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. - Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) - beta (float or None, optional): The shape parameter used for kaiser window. - dtype (torch.device, optional): - Determnines the precision that resampling kernel is pre-computed and cached. If not provided, - kernel is computed with ``torch.float64`` then cached as ``torch.float32``. - If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and - cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this - providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still - carried out on ``torch.float64``. - - Example - >>> waveform, sample_rate = ... 
- >>> transform = transforms.Resample(sample_rate, sample_rate/10) - >>> waveform = transform(waveform) - """ - - def __init__( - self, - orig_freq: int = 16000, - new_freq: int = 16000, - resampling_method: str = "sinc_interp_hann", - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - beta: Optional[float] = None, - *, - dtype: Optional[torch.dtype] = None, - ) -> None: - super().__init__() - - self.orig_freq = orig_freq - self.new_freq = new_freq - self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) - self.resampling_method = resampling_method - self.lowpass_filter_width = lowpass_filter_width - self.rolloff = rolloff - self.beta = beta - - if self.orig_freq != self.new_freq: - kernel, self.width = _get_sinc_resample_kernel( - self.orig_freq, - self.new_freq, - self.gcd, - self.lowpass_filter_width, - self.rolloff, - self.resampling_method, - beta, - dtype=dtype, - ) - self.register_buffer("kernel", kernel) - - def forward(self, waveform: torch.Tensor) -> torch.Tensor: - r""" - Args: - waveform (Tensor): Tensor of audio of dimension (..., time). - - Returns: - Tensor: Output signal of dimension (..., time). - """ - if self.orig_freq == self.new_freq: - return waveform - return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) - - -def resample( - waveform: torch.Tensor, - orig_freq: int, - new_freq: int, - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - resampling_method: str = "sinc_interp_hann", - beta: Optional[float] = None, -) -> torch.Tensor: - r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. - - .. devices:: CPU CUDA - - .. properties:: Autograd TorchScript - - Note: - ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in - more efficient computation if resampling multiple waveforms with the same resampling parameters. - - Args: - waveform (Tensor): The input signal of dimension `(..., time)` - orig_freq (int): The original frequency of the signal - new_freq (int): The desired frequency - lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper - but less efficient. (Default: ``6``) - rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. - Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) - resampling_method (str, optional): The resampling method to use. - Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) - beta (float or None, optional): The shape parameter used for kaiser window. 
- - Returns: - Tensor: The waveform at the new frequency of dimension `(..., time).` - """ - - if orig_freq <= 0.0 or new_freq <= 0.0: - raise ValueError("Original frequency and desired frequecy should be positive") - - if orig_freq == new_freq: - return waveform - - gcd = math.gcd(int(orig_freq), int(new_freq)) - - kernel, width = _get_sinc_resample_kernel( - orig_freq, - new_freq, - gcd, - lowpass_filter_width, - rolloff, - resampling_method, - beta, - waveform.device, - waveform.dtype, - ) - resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) - return resampled - - -def _get_sinc_resample_kernel( - orig_freq: int, - new_freq: int, - gcd: int, - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - resampling_method: str = "sinc_interp_hann", - beta: Optional[float] = None, - device: torch.device = "cpu", - dtype: Optional[torch.dtype] = None, -): - if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): - raise Exception( - "Frequencies must be of integer type to ensure quality resampling computation. " - "To work around this, manually convert both frequencies to integer values " - "that maintain their resampling rate ratio before passing them into the function. " - "Example: To downsample a 44100 hz waveform by a factor of 8, use " - "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " - "For more information, please refer to https://github.com/pytorch/audio/issues/1487." - ) - - if resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: - raise ValueError("Invalid resampling method: {}".format(resampling_method)) - - orig_freq = int(orig_freq) // gcd - new_freq = int(new_freq) // gcd - - if lowpass_filter_width <= 0: - raise ValueError("Low pass filter width should be positive.") - base_freq = min(orig_freq, new_freq) - # This will perform antialiasing filtering by removing the highest frequencies. - # At first I thought I only needed this when downsampling, but when upsampling - # you will get edge artifacts without this, as the edge is equivalent to zero padding, - # which will add high freq artifacts. - base_freq *= rolloff - - # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) - # using the sinc interpolation formula: - # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) - # We can then sample the function x(t) with a different sample rate: - # y[j] = x(j / new_freq) - # or, - # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) - - # We see here that y[j] is the convolution of x[i] with a specific filter, for which - # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. - # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. - # Indeed: - # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) - # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) - # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) - # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. - # This will explain the F.conv1d after, with a stride of orig_freq. - width = math.ceil(lowpass_filter_width * orig_freq / base_freq) - # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., - # they will have a lot of almost zero values to the left or to the right... 
- # There is probably a way to evaluate those filters more efficiently, but this is kept for - # future work. - idx_dtype = dtype if dtype is not None else torch.float64 - - idx = torch.arange(-width, width + orig_freq, dtype=idx_dtype, device=device)[None, None] / orig_freq - - t = torch.arange(0, -new_freq, -1, dtype=dtype, device=device)[:, None, None] / new_freq + idx - t *= base_freq - t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) - - # we do not use built in torch windows here as we need to evaluate the window - # at specific positions, not over a regular grid. - if resampling_method == "sinc_interp_hann": - window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 - else: - # sinc_interp_kaiser - if beta is None: - beta = 14.769656459379492 - beta_tensor = torch.tensor(float(beta)) - window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) - - t *= math.pi - - scale = base_freq / orig_freq - kernels = torch.where(t == 0, torch.tensor(1.0).to(t), t.sin() / t) - kernels *= window * scale - - if dtype is None: - kernels = kernels.to(dtype=torch.float32) - - return kernels, width - - -def _apply_sinc_resample_kernel( - waveform: torch.Tensor, - orig_freq: int, - new_freq: int, - gcd: int, - kernel: torch.Tensor, - width: int, -): - if not waveform.is_floating_point(): - raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") - - orig_freq = int(orig_freq) // gcd - new_freq = int(new_freq) // gcd - - # pack batch - shape = waveform.size() - waveform = waveform.view(-1, shape[-1]) - - num_wavs, length = waveform.shape - waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) - resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) - resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) - target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long() - resampled = resampled[..., :target_length] - - # unpack batch - resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) - return resampled diff --git a/nemo/collections/audio/parts/utils/transforms.py b/nemo/collections/audio/parts/utils/transforms.py new file mode 100644 index 000000000000..6f7f91479904 --- /dev/null +++ b/nemo/collections/audio/parts/utils/transforms.py @@ -0,0 +1,1105 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# NOTE: The code below originates from torchaudio repository, version 2.9. +# It can be found under: https://github.com/pytorch/audio/tree/release/2.9 +# The modifications applied are mostly cosmetic. +# The inclusion of this code in NeMo allows us to avoid +# a dependency with a problematic build process. +# This code is licensed under the BSD 2-Clause License, +# included verbatim from the torchaudio repository below: +# +# BSD 2-Clause License +# +# Copyright (c) 2017 Facebook Inc. (Soumith Chintala), +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math +import warnings +from typing import Callable, Optional, Union + +import torch +from torch import Tensor + +__all__ = ["Spectrogram", "MelSpectrogram", "MFCC", "Resample"] + + +class Spectrogram(torch.nn.Module): + r"""Create a spectrogram from a audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float or None, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + If None, then the complex spectrum is returned instead. (Default: ``2``) + normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are + ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to + ``"window"``. (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy (Default: ``True``) + return_complex (bool, optional): + Deprecated and not used. 
+ + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = torchaudio.transforms.Spectrogram(n_fft=800) + >>> spectrogram = transform(waveform) + + """ + + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: Optional[float] = 2.0, + normalized: Union[bool, str] = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, + ) -> None: + super().__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.power = power + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Dimension (..., freq, time), where freq is + ``n_fft // 2 + 1`` where ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + return spectrogram( + waveform, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class MelSpectrogram(torch.nn.Module): + r"""Create MelSpectrogram for a raw audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This is a composition of :py:func:`torchaudio.transforms.Spectrogram` + and :py:func:`torchaudio.transforms.MelScale`. + + Sources + * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe + * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html + * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``None``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. 
(Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided: Deprecated and unused. + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.MelSpectrogram(sample_rate) + >>> mel_specgram = transform(waveform) # (channel, n_mels, time) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"] + + def __init__( + self, + sample_rate: int = 16000, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + f_min: float = 0.0, + f_max: Optional[float] = None, + pad: int = 0, + n_mels: int = 128, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: Optional[bool] = None, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelSpectrogram, self).__init__() + + if onesided is not None: + warnings.warn( + "Argument 'onesided' has been deprecated and has no influence on the behavior of this module." + ) + + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + self.pad = pad + self.power = power + self.normalized = normalized + self.n_mels = n_mels # number of mel frequency bins + self.f_max = f_max + self.f_min = f_min + self.spectrogram = Spectrogram( + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length, + pad=self.pad, + window_fn=window_fn, + power=self.power, + normalized=self.normalized, + wkwargs=wkwargs, + center=center, + pad_mode=pad_mode, + onesided=True, + ) + self.mel_scale = MelScale( + self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + specgram = self.spectrogram(waveform) + mel_specgram = self.mel_scale(specgram) + return mel_specgram + + +class MFCC(torch.nn.Module): + r"""Create the Mel-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + By default, this calculates the MFCC on the DB-scaled Mel spectrogram. 
+ This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``"ortho"``) + log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``) + melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.MFCC( + >>> sample_rate=sample_rate, + >>> n_mfcc=13, + >>> melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False}, + >>> ) + >>> mfcc = transform(waveform) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"] + + def __init__( + self, + sample_rate: int = 16000, + n_mfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_mels: bool = False, + melkwargs: Optional[dict] = None, + ) -> None: + super(MFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + melkwargs = melkwargs or {} + self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs) + + if self.n_mfcc > self.MelSpectrogram.n_mels: + raise ValueError("Cannot select more MFCC coefficients than # mel bins") + dct_mat = create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_mels = log_mels + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: specgram_mel_db of size (..., ``n_mfcc``, time). + """ + mel_specgram = self.MelSpectrogram(waveform) + if self.log_mels: + log_offset = 1e-6 + mel_specgram = torch.log(mel_specgram + log_offset) + else: + mel_specgram = self.amplitude_to_DB(mel_specgram) + + # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_nfcc, time) + mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return mfcc + + +class Resample(torch.nn.Module): + r"""Resample a signal from one frequency to another. A resampling method can be given. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + If resampling on waveforms of higher precision than float32, there may be a small loss of precision + because the kernel is cached once as float32. If high precision resampling is important for your application, + the functional form will retain higher precision, but run slower because it does not cache the kernel. + Alternatively, you could rewrite a transform that caches a higher precision kernel. + + Args: + orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) + new_freq (int, optional): The desired frequency. 
(Default: ``16000``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``) + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + beta (float or None, optional): The shape parameter used for kaiser window. + dtype (torch.device, optional): + Determnines the precision that resampling kernel is pre-computed and cached. If not provided, + kernel is computed with ``torch.float64`` then cached as ``torch.float32``. + If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and + cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this + providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still + carried out on ``torch.float64``. + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.Resample(sample_rate, sample_rate/10) + >>> waveform = transform(waveform) + """ + + def __init__( + self, + orig_freq: int = 16000, + new_freq: int = 16000, + resampling_method: str = "sinc_interp_hann", + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + beta: Optional[float] = None, + *, + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__() + + self.orig_freq = orig_freq + self.new_freq = new_freq + self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) + self.resampling_method = resampling_method + self.lowpass_filter_width = lowpass_filter_width + self.rolloff = rolloff + self.beta = beta + + if self.orig_freq != self.new_freq: + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.new_freq, + self.gcd, + self.lowpass_filter_width, + self.rolloff, + self.resampling_method, + beta, + dtype=dtype, + ) + self.register_buffer("kernel", kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Output signal of dimension (..., time). + """ + if self.orig_freq == self.new_freq: + return waveform + return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) + + +class MelScale(torch.nn.Module): + r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``) + norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) + >>> spectrogram = spectrogram_transform(waveform) + >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1) + >>> melscale_spectrogram = melscale_transform(spectrogram) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"] + + def __init__( + self, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_stft: int = 201, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.f_min = f_min + self.norm = norm + self.mel_scale = mel_scale + + if f_min > self.f_max: + raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max)) + + fb = melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale) + self.register_buffer("fb", fb) + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + + # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time) + mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2) + + return mel_specgram + + +class AmplitudeToDB(torch.nn.Module): + r"""Turn a tensor from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This output depends on the maximum value in the input tensor, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + stype (str, optional): scale of input tensor (``"power"`` or ``"magnitude"``). The + power being the elementwise square of the magnitude. (Default: ``"power"``) + top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable + number is 80. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.AmplitudeToDB(stype="amplitude", top_db=80) + >>> waveform_db = transform(waveform) + """ + + __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"] + + def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None: + super(AmplitudeToDB, self).__init__() + self.stype = stype + if top_db is not None and top_db < 0: + raise ValueError("top_db must be positive value") + self.top_db = top_db + self.multiplier = 10.0 if stype == "power" else 20.0 + self.amin = 1e-10 + self.ref_value = 1.0 + self.db_multiplier = math.log10(max(self.amin, self.ref_value)) + + def forward(self, x: Tensor) -> Tensor: + r"""Numerically stable implementation from Librosa. + + https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html + + Args: + x (Tensor): Input tensor before being converted to decibel scale. + + Returns: + Tensor: Output tensor in decibel scale. 
+ """ + return amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db) + + +def resample( + waveform: Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, +) -> Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. + Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) + beta (float or None, optional): The shape parameter used for kaiser window. + + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + if orig_freq <= 0.0 or new_freq <= 0.0: + raise ValueError("Original frequency and desired frequecy should be positive") + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.device, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, + device: torch.device = "cpu", + dtype: Optional[torch.dtype] = None, +): + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + "To work around this, manually convert both frequencies to integer values " + "that maintain their resampling rate ratio before passing them into the function. " + "Example: To downsample a 44100 hz waveform by a factor of 8, use " + "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " + "For more information, please refer to https://github.com/pytorch/audio/issues/1487." + ) + + if resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + if lowpass_filter_width <= 0: + raise ValueError("Low pass filter width should be positive.") + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. 
+ # At first I thought I only needed this when downsampling, but when upsampling + # you will get edge artifacts without this, as the edge is equivalent to zero padding, + # which will add high freq artifacts. + base_freq *= rolloff + + # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) + # using the sinc interpolation formula: + # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) + # We can then sample the function x(t) with a different sample rate: + # y[j] = x(j / new_freq) + # or, + # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + + # We see here that y[j] is the convolution of x[i] with a specific filter, for which + # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. + # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. + # Indeed: + # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) + # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) + # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. + # This will explain the F.conv1d after, with a stride of orig_freq. + width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. + idx_dtype = dtype if dtype is not None else torch.float64 + + idx = torch.arange(-width, width + orig_freq, dtype=idx_dtype, device=device)[None, None] / orig_freq + + t = torch.arange(0, -new_freq, -1, dtype=dtype, device=device)[:, None, None] / new_freq + idx + t *= base_freq + t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) + + # we do not use built in torch windows here as we need to evaluate the window + # at specific positions, not over a regular grid. 
+ if resampling_method == "sinc_interp_hann": + window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # sinc_interp_kaiser + if beta is None: + beta = 14.769656459379492 + beta_tensor = torch.tensor(float(beta)) + window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) + + t *= math.pi + + scale = base_freq / orig_freq + kernels = torch.where(t == 0, torch.tensor(1.0).to(t), t.sin() / t) + kernels *= window * scale + + if dtype is None: + kernels = kernels.to(dtype=torch.float32) + + return kernels, width + + +def _apply_sinc_resample_kernel( + waveform: Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: Tensor, + width: int, +): + if not waveform.is_floating_point(): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + num_wavs, length = waveform.shape + waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) + resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) + target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long() + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def spectrogram( + waveform: Tensor, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: Optional[float], + normalized: Union[bool, str], + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, +) -> Tensor: + r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. + The spectrogram can be either magnitude-only or complex. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + power (float or None): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + If None, then the complex spectrum is returned instead. + normalized (bool or str): Whether to normalize by magnitude after stft. If input is str, choices are + ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to + ``"window"``. When normalized on ``"window"``, waveform is normalized upon the window's L2 energy. If + normalized on ``"frame_length"``, waveform is normalized by dividing by + :math:`(\text{frame\_length})^{0.5}`. + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. Default: ``True`` + return_complex (bool, optional): + Deprecated and not used. 
+ + Returns: + Tensor: Dimension `(..., freq, time)`, freq is + ``n_fft // 2 + 1`` and ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.functional.spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + if pad > 0: + # TODO add "with torch.no_grad():" back when JIT supports it + waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant") + + frame_length_norm, window_norm = _get_spec_norms(normalized) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=frame_length_norm, + onesided=onesided, + return_complex=True, + ) + + # unpack batch + spec_f = spec_f.reshape(shape[:-1] + spec_f.shape[-2:]) + + if window_norm: + spec_f /= window.pow(2.0).sum().sqrt() + if power is not None: + if power == 1.0: + return spec_f.abs() + return spec_f.abs().pow(power) + return spec_f + + +def _get_spec_norms(normalized: Union[str, bool]): + frame_length_norm, window_norm = False, False + if torch.jit.isinstance(normalized, str): + if normalized not in ["frame_length", "window"]: + raise ValueError("Invalid normalized parameter: {}".format(normalized)) + if normalized == "frame_length": + frame_length_norm = True + elif normalized == "window": + window_norm = True + elif torch.jit.isinstance(normalized, bool): + if normalized: + window_norm = True + else: + raise TypeError("Input type not supported") + return frame_length_norm, window_norm + + +def amplitude_to_DB( + x: Tensor, multiplier: float, amin: float, db_multiplier: float, top_db: Optional[float] = None +) -> Tensor: + r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The output of each tensor in a batch depends on the maximum value of that tensor, + and so may return different values for an audio clip split into snippets vs. a full clip. + + Args: + + x (Tensor): Input spectrogram(s) before being converted to decibel scale. + The expected shapes are ``(freq, time)``, ``(channel, freq, time)`` or + ``(..., batch, channel, freq, time)``. + + .. note:: + + When ``top_db`` is specified, cut-off values are computed for each audio + in the batch. Therefore if the input shape is 4D (or larger), different + cut-off values are used for audio data in the batch. + If the input shape is 2D or 3D, a single cutoff value is used. + + multiplier (float): Use 10. for power and 20. for amplitude + amin (float): Number to clamp ``x`` + db_multiplier (float): Log10(max(reference value and amin)) + top_db (float or None, optional): Minimum negative cut-off in decibels. A reasonable number + is 80. 
(Default: ``None``) + + Returns: + Tensor: Output tensor in decibel scale + """ + x_db = multiplier * torch.log10(torch.clamp(x, min=amin)) + x_db -= multiplier * db_multiplier + + if top_db is not None: + # Expand batch + shape = x_db.size() + packed_channels = shape[-3] if x_db.dim() > 2 else 1 + x_db = x_db.reshape(-1, packed_channels, shape[-2], shape[-1]) + + x_db = torch.max(x_db, (x_db.amax(dim=(-3, -2, -1)) - top_db).view(-1, 1, 1, 1)) + + # Repack batch + x_db = x_db.reshape(shape) + + return x_db + + +def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: + r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), + normalized depending on norm. + + .. devices:: CPU + + .. properties:: TorchScript + + Args: + n_mfcc (int): Number of mfc coefficients to retain + n_mels (int): Number of mel filterbanks + norm (str or None): Norm to use (either "ortho" or None) + + Returns: + Tensor: The transformation matrix, to be right-multiplied to + row-wise data of size (``n_mels``, ``n_mfcc``). + """ + + if norm is not None and norm != "ortho": + raise ValueError('norm must be either "ortho" or None') + + # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II + n = torch.arange(float(n_mels)) + k = torch.arange(float(n_mfcc)).unsqueeze(1) + dct = torch.cos(math.pi / float(n_mels) * (n + 0.5) * k) # size (n_mfcc, n_mels) + + if norm is None: + dct *= 2.0 + else: + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.t() + + +def melscale_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> Tensor: + r"""Create a frequency bin conversion matrix. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A @ melscale_fbanks(A.size(-1), ...)``. 
+ + """ + + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = _hz_to_mel(f_min, mel_scale=mel_scale) + m_max = _hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = torch.linspace(m_min, m_max, n_mels + 2) + f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= enorm.unsqueeze(0) + + if (fb.max(dim=0).values == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return fb + + +def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): Frequencies in Hz + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + +def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor: + """Convert mel bin numbers to frequencies. + + Args: + mels (Tensor): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + freqs (Tensor): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +def _create_triangular_filterbank( + all_freqs: Tensor, + f_pts: Tensor, +) -> Tensor: + """Create a triangular filter bank. + + Args: + all_freqs (Tensor): STFT freq points of size (`n_freqs`). + f_pts (Tensor): Filter mid points of size (`n_filter`). + + Returns: + fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`). 
+ """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = torch.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = torch.max(zero, torch.min(down_slopes, up_slopes)) + + return fb diff --git a/nemo/collections/speechlm2/models/duplex_s2s_model.py b/nemo/collections/speechlm2/models/duplex_s2s_model.py index 2de158d88be9..f3c865f63a45 100644 --- a/nemo/collections/speechlm2/models/duplex_s2s_model.py +++ b/nemo/collections/speechlm2/models/duplex_s2s_model.py @@ -30,7 +30,7 @@ ) from transformers import DynamicCache -from nemo.collections.audio.parts.utils.resampling import resample +from nemo.collections.audio.parts.utils.transforms import resample from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.speechlm2.data.utils import get_pad_id from nemo.collections.speechlm2.parts.hf_hub import HFHubMixin diff --git a/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py b/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py index 3605e886b3e4..7724b4d0e01d 100644 --- a/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py +++ b/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py @@ -29,7 +29,7 @@ ) from transformers import DynamicCache -from nemo.collections.audio.parts.utils.resampling import resample +from nemo.collections.audio.parts.utils.transforms import resample from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.speechlm2.data.utils import get_pad_id from nemo.collections.speechlm2.models.duplex_s2s_model import replace_control_speech_codes, tokens_to_str diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index f37818e43d89..2cdd5f0f8c9c 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -13,6 +13,7 @@ # limitations under the License. 
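
# --- Editorial aside, not part of the patch: how the functional helpers above compose. ---
# The new module bundles the pieces previously taken from torchaudio (resample, spectrogram,
# amplitude_to_DB, create_dct, melscale_fbanks, plus the Resample and MelSpectrogram modules used by
# the files changed below). A hedged sketch of a log-mel/MFCC pipeline built only from the functions
# defined above, assuming the module path used by the import changes in this patch.
import math

import torch

from nemo.collections.audio.parts.utils.transforms import (
    amplitude_to_DB,
    create_dct,
    melscale_fbanks,
    spectrogram,
)

sample_rate, n_fft, hop_length, n_mels, n_mfcc = 16_000, 400, 160, 64, 13
waveform = torch.randn(1, sample_rate)  # one second of batched audio

# Power spectrogram: (..., n_fft // 2 + 1, time)
spec = spectrogram(
    waveform,
    pad=0,
    window=torch.hann_window(n_fft),
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=n_fft,
    power=2.0,
    normalized=False,
)

# Mel filterbank of shape (n_freqs, n_mels), applied as described in the melscale_fbanks docstring
fb = melscale_fbanks(
    n_freqs=n_fft // 2 + 1, f_min=0.0, f_max=sample_rate / 2, n_mels=n_mels, sample_rate=sample_rate
)
mel = torch.matmul(spec.transpose(-1, -2), fb).transpose(-1, -2)  # (..., n_mels, time)

# Decibel scale: multiplier=10 for power spectrograms; db_multiplier = log10(reference) = 0 for reference 1.0
log_mel = amplitude_to_DB(mel, multiplier=10.0, amin=1e-10, db_multiplier=0.0, top_db=80.0)

# MFCCs via the DCT-II matrix from create_dct: (..., n_mfcc, time)
dct_mat = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, norm="ortho")
mfcc = torch.matmul(log_mel.transpose(-1, -2), dct_mat).transpose(-1, -2)

assert mfcc.shape[:-1] == (1, n_mfcc)  # frame count depends on hop_length; here 1 + 16000 // 160 = 101
# --- End of editorial aside. ---
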
import itertools +from contextlib import nullcontext from math import ceil from pathlib import Path from typing import List, Tuple @@ -24,6 +25,7 @@ from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict +from nemo.collections.audio.parts.utils.transforms import Resample, resample from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.losses.audio_codec_loss import ( FeatureMatchingLoss, @@ -51,13 +53,6 @@ from nemo.core.optim.lr_scheduler import compute_max_steps, prepare_lr_scheduler from nemo.utils import logging, model_utils -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - class AudioCodecModel(ModelPT): def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -192,7 +187,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() - print("Speaker encoder loaded and frozen !!") + logging.info("Speaker encoder loaded and frozen !!") + self.speaker_encoder_resampler = Resample( + orig_freq=self.sample_rate, new_freq=self.speaker_encoder.audio_config["sample_rate"] + ) # Disabled for now as it is not used in final model self.use_asr_consitency_loss = False @@ -254,24 +252,9 @@ def load_state_dict(self, state_dict, strict=True): super().load_state_dict(state_dict, strict=False) def get_speaker_embedding(self, audio, requires_grad=False): - if not requires_grad: - with torch.no_grad(): - if HAVE_TORCHAUDIO: - audio_resampled = torchaudio.functional.resample( - audio, self.sample_rate, self.speaker_encoder.audio_config["sample_rate"] - ) - else: - logging.error('Could not import torchaudio!') - raise ModuleNotFoundError("torchaudio is not installed but is necessary to audio resample !!") - g = self.speaker_encoder(audio_resampled, l2_norm=True).unsqueeze(-1) - else: - if HAVE_TORCHAUDIO: - audio_resampled = torchaudio.functional.resample( - audio, self.sample_rate, self.speaker_encoder.audio_config["sample_rate"] - ) - else: - logging.error('Could not import torchaudio!') - raise ModuleNotFoundError("torchaudio is not installed but is necessary to audio resample !!") + grad_context = nullcontext() if requires_grad else torch.no_grad() + with grad_context: + audio_resampled = self.speaker_encoder_resampler(audio) g = self.speaker_encoder(audio_resampled, l2_norm=True).unsqueeze(-1) return g @@ -506,10 +489,7 @@ def pad_audio(self, audio, audio_len, samples_per_frame): def preprocess_audio(self, audio, audio_len, sample_rate): if sample_rate and sample_rate != self.sample_rate: - if not HAVE_TORCHAUDIO: - raise ModuleNotFoundError("Must install torchaudio for resampling.") - - audio = torchaudio.functional.resample(waveform=audio, orig_freq=sample_rate, new_freq=self.sample_rate) + audio = resample(waveform=audio, orig_freq=sample_rate, new_freq=self.sample_rate) audio_len_scaled = audio_len.long() * self.sample_rate new_audio_len = audio_len_scaled / sample_rate # To avoid rounding issues at lower precisions, do not call torch.ceil when the length is divisible by the sample rate diff --git a/nemo/collections/tts/modules/audio_codec_modules.py b/nemo/collections/tts/modules/audio_codec_modules.py index 8ce522d798ec..3a0506f06d2c 100755 --- a/nemo/collections/tts/modules/audio_codec_modules.py +++ b/nemo/collections/tts/modules/audio_codec_modules.py @@ -25,6 +25,7 @@ from transformers import AutoModel from nemo.collections.asr.modules import 
AudioToMelSpectrogramPreprocessor +from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, Resample from nemo.collections.common.parts.utils import ClampActivation, HalfSnake, Snake, mask_sequence_tensor from nemo.core.classes.common import typecheck from nemo.core.classes.module import NeuralModule @@ -39,13 +40,6 @@ from nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - try: import fsspec @@ -121,11 +115,7 @@ def __init__( ): super().__init__() - if HAVE_TORCHAUDIO: - self.resample = torchaudio.transforms.Resample(input_sr, slm_sr) - else: - self.resample = None - + self.resample = Resample(orig_freq=input_sr, new_freq=slm_sr) self.slm_model = SSLModel(slm_model_name) # Freeze slm model @@ -353,11 +343,7 @@ def __init__( self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec and HAVE_TORCHAUDIO: - self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) - else: - self.torch_spec = None + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) if self.use_torch_spec else None outmap_size = int(self.input_dim / 8) @@ -460,7 +446,7 @@ def forward(self, x, l2_norm=False): def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( + MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], diff --git a/scripts/installers/install_torchaudio_latest.sh b/scripts/installers/install_torchaudio_latest.sh deleted file mode 100755 index bdad771fe267..000000000000 --- a/scripts/installers/install_torchaudio_latest.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Torch and torchaudio versions must match. Othervise, there will be no CUDA support. -# See https://github.com/pytorch/audio/blob/f0bc00c980012badea8db011f84a0e9ef33ba6c1/README.md?plain=1#L66 - -DEPENDENCIES_INSTALL_CMD="apt update && apt install -y ffmpeg sox libavdevice-dev" - -read -r -d '' INFO_MESSAGE << EOM -INFO: This script is supposed to be used when building a docker container using Dockerfile in NeMo. -Use the script only for compiling torchaudio from scratch with a Non-Standard PyTorch version (e.g., 2.1.0a0+32f93b1) -For the release PyTorch version (e.g., 2.1.0), use 'pip install torchaudio' instead. -If running stand-alone, install dependencies first: '${DEPENDENCIES_INSTALL_CMD}' -EOM - -echo "$INFO_MESSAGE" - -for lib in libavdevice sox; do - if ! grep -q ${lib} <<< "$(ldconfig -p)"; then - echo "ERROR: ${lib} not found. Install dependencies before running the script: '${DEPENDENCIES_INSTALL_CMD}'" - exit 1 - fi -done - -if ! 
command -v ffmpeg &> /dev/null; then - echo "ERROR: ffmpeg not found. Install dependencies before running the script: '${DEPENDENCIES_INSTALL_CMD}'" - exit 1 -fi - -TORCHAUDIO_REPO=https://github.com/pytorch/audio -# expected LATEST_RELEASE=release/*.** -LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ - ls-remote --exit-code --refs --sort='version:refname' --heads ${TORCHAUDIO_REPO} 'release/*.*' \ - | tail --lines=1 \ - | cut -d '/' -f 3,4) -TORCHAUDIO_LATEST_MAJOR_VERSION=$(python3 -c "major_version = (\"${LATEST_RELEASE}\".split('/')[-1]).split('.')[0]; print(major_version)") -TORCHAUDIO_LATEST_MINOR_VERSION=$(python3 -c "minor_version = \"${LATEST_RELEASE}\".rsplit('.')[-1]; print(minor_version)") - -# avoid checking PYTORCH_VERSION variable, not available everywhere -TORCH_FULL_VERSION=$(python3 -c "import torch; print(torch.__version__)") -TORCH_MAIN_VERSION=$(python3 -c "import torch, re; print(re.search(r'(\d+\.?)+', torch.__version__).group(0))") -TORCH_MAJOR_VERSION=$(python3 -c "major_version = \"${TORCH_MAIN_VERSION}\".split('.')[0]; print(major_version)") -TORCH_MINOR_VERSION=$(python3 -c "minor_version = \"${TORCH_MAIN_VERSION}\".split('.')[1]; print(minor_version)") -TORCH_FIX_VERSION=$(python3 -c "minor_version = \"${TORCH_MAIN_VERSION}\".split('.')[2]; print(minor_version)") - - -echo "Latest torchaudio release: ${TORCHAUDIO_LATEST_MAJOR_VERSION}.${TORCHAUDIO_LATEST_MINOR_VERSION}" -echo "Pytorch version: ${TORCH_MAIN_VERSION:0:6}" - -if [[ $TORCH_MAJOR_VERSION -eq 1 ]]; then - if [[ $TORCH_MINOR_VERSION -le 13 ]]; then - INSTALL_BRANCH="release/0.${TORCH_MINOR_VERSION}" - else - # fix for PyTorch 1.14 (no official release) - INSTALL_BRANCH="release/2.0" - fi - TORCHAUDIO_MAJOR_VERSION=0 -else # version 2 expected - TORCHAUDIO_MAJOR_VERSION=${TORCH_MAJOR_VERSION} - INSTALL_BRANCH="release/${TORCH_MAJOR_VERSION}.${TORCH_MINOR_VERSION}" -fi - - -# check if install branch exists -if [[ $(git ls-remote --heads ${TORCHAUDIO_REPO} ${INSTALL_BRANCH} | wc -l) -eq 0 ]] -then - echo "Branch ${INSTALL_BRANCH} does not exist in torchaudio repo. Using latest release." - INSTALL_BRANCH=${LATEST_RELEASE} -fi - -# expected TORCHAUDIO_BUILD_VERSION=*.**.* -TORCHAUDIO_BUILD_VERSION="${TORCHAUDIO_MAJOR_VERSION}.${TORCH_MINOR_VERSION}.${TORCH_FIX_VERSION}" - -echo "Torchaudio build version: ${TORCHAUDIO_BUILD_VERSION}" -echo "Installing torchaudio from branch: ${INSTALL_BRANCH}" - -# we need parameterized to run torchaudio tests -# suppose that we do not have parameterized installed yet -pip install parameterized - -# Build torchaudio and run MFCC test -# NB: setting PYTORCH_VERSION is a workaround for the case where PYTORCH_VERSION is set, but contains incorrect value -# e.g., in container nvcr.io/nvidia/pytorch:24.03-py3 -git clone --depth 1 --branch ${INSTALL_BRANCH} https://github.com/pytorch/audio.git && \ -cd audio && \ -git submodule update --init --recursive && \ -PYTORCH_VERSION=${TORCH_FULL_VERSION} USE_FFMPEG=1 BUILD_SOX=1 BUILD_VERSION=${TORCHAUDIO_BUILD_VERSION} python setup.py install && \ -cd .. && \ -pytest -rs audio/test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py -k 'test_MFCC' || \ -{ echo "ERROR: Failed to install torchaudio!"; exit 1; }; -# RNNT loss is built with CUDA, so checking it will suffice -# This test will be skipped if CUDA is not available (e.g. 
when building from docker) -pytest -rs audio/test/torchaudio_unittest/functional/torchscript_consistency_cuda_test.py -k 'test_rnnt_loss' || \ -echo "WARNING: Failed to install torchaudio with CUDA support!"; -rm -rf audio && \ -echo "Torchaudio installed successfully!" diff --git a/tests/collections/audio/test_audio_losses.py b/tests/collections/audio/test_audio_losses.py index 61875751796d..33d25bd8f182 100644 --- a/tests/collections/audio/test_audio_losses.py +++ b/tests/collections/audio/test_audio_losses.py @@ -29,16 +29,6 @@ convolution_invariant_target, scale_invariant_target, ) - -try: - import importlib - - importlib.import_module('torchaudio') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - from nemo.collections.audio.losses.maxine import CombinedLoss from nemo.collections.audio.parts.utils.audio import ( calculate_sdr_numpy, @@ -1089,7 +1079,6 @@ def test_mae_invalid_ndim(self): MAELoss(ndim=5) @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") def test_maxine_combined_loss(self, test_data_dir): INPUT_LOCATION = os.path.join(test_data_dir, 'audio', 'maxine', 'input.bin') ATOL = 1e-2 diff --git a/tests/collections/audio/test_audio_maxine_models.py b/tests/collections/audio/test_audio_maxine_models.py index 9c54a06ef0a1..5ae6793edf89 100644 --- a/tests/collections/audio/test_audio_maxine_models.py +++ b/tests/collections/audio/test_audio_maxine_models.py @@ -16,15 +16,6 @@ import torch from omegaconf import DictConfig -try: - import importlib - - importlib.import_module('torchaudio') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - from nemo.collections.audio.models.maxine import BNR2 @@ -81,7 +72,6 @@ class TestBNR2Model: """Test BNR 2 model.""" @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") def test_constructor(self, maxine_model_fixture): """Test that the model can be constructed from a config dict.""" model = maxine_model_fixture.train() @@ -90,7 +80,6 @@ def test_constructor(self, maxine_model_fixture): assert isinstance(instance2, BNR2) @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") @pytest.mark.parametrize( "batch_size, sample_len", [ diff --git a/tests/collections/audio/test_audio_metrics.py b/tests/collections/audio/test_audio_metrics.py index 578b67fc2479..0e221c37e7a5 100644 --- a/tests/collections/audio/test_audio_metrics.py +++ b/tests/collections/audio/test_audio_metrics.py @@ -17,6 +17,7 @@ from nemo.collections.audio.metrics.audio import AudioMetricWrapper from nemo.collections.audio.metrics.squim import SquimMOSMetric, SquimObjectiveMetric +from nemo.collections.audio.parts.utils.transforms import Resample try: import torchaudio @@ -165,7 +166,7 @@ def test_squim_mos(self, fs: int): squim_mos_metric = SquimMOSMetric(fs=fs) # Helper function - resampler = torchaudio.transforms.Resample( + resampler = Resample( orig_freq=fs, new_freq=16000, lowpass_filter_width=64, @@ -222,7 +223,7 @@ def test_squim_objective(self, metric: str, fs: int): squim_objective_metric = SquimObjectiveMetric(fs=fs, metric=metric) # Helper function - resampler = torchaudio.transforms.Resample( + resampler = Resample( orig_freq=fs, new_freq=16000, lowpass_filter_width=64, diff --git a/tests/collections/audio/test_audio_models_flow_matching.py b/tests/collections/audio/test_audio_models_flow_matching.py index 65ddd564e478..b24424c7570a 
100644 --- a/tests/collections/audio/test_audio_models_flow_matching.py +++ b/tests/collections/audio/test_audio_models_flow_matching.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import FlowMatchingAudioToAudioModel +from nemo.collections.audio.models.enhancement import FlowMatchingAudioToAudioModel def convert_to_dictconfig(d): @@ -79,7 +79,7 @@ def flow_matching_base_config(request): 'time_max': flow['time_max'], } - loss = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} + loss = {'_target_': 'nemo.collections.audio.losses.audio.MSELoss', 'ndim': 4} estimator = { '_target_': 'nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet', diff --git a/tests/collections/audio/test_audio_models_mask.py b/tests/collections/audio/test_audio_models_mask.py index f847fcbaf313..a6d766818f98 100644 --- a/tests/collections/audio/test_audio_models_mask.py +++ b/tests/collections/audio/test_audio_models_mask.py @@ -25,7 +25,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import EncMaskDecAudioToAudioModel +from nemo.collections.audio.models.enhancement import EncMaskDecAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -111,7 +111,7 @@ def mask_model_rnn_params(): } loss = { - '_target_': 'nemo.collections.audio.losses.SDRLoss', + '_target_': 'nemo.collections.audio.losses.audio.SDRLoss', 'scale_invariant': True, } @@ -212,7 +212,7 @@ def mask_model_flexarray(): } loss = { - '_target_': 'nemo.collections.audio.losses.SDRLoss', + '_target_': 'nemo.collections.audio.losses.audio.SDRLoss', 'scale_invariant': True, } diff --git a/tests/collections/audio/test_audio_models_predictive.py b/tests/collections/audio/test_audio_models_predictive.py index b02f7e810ed1..55a688dbdccc 100644 --- a/tests/collections/audio/test_audio_models_predictive.py +++ b/tests/collections/audio/test_audio_models_predictive.py @@ -23,7 +23,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import PredictiveAudioToAudioModel +from nemo.collections.audio.models.enhancement import PredictiveAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -111,7 +111,7 @@ def predictive_model_ncsn(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -183,7 +183,7 @@ def predictive_model_conformer(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -255,7 +255,7 @@ def predictive_model_streaming_conformer(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -318,7 +318,7 @@ def predictive_model_transformer_unet_params_base(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -384,7 +384,7 @@ def predictive_model_conformer_unet(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 
'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -456,7 +456,7 @@ def predictive_model_streaming_conformer_unet(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( diff --git a/tests/collections/audio/test_audio_models_schroedinger_bridge.py b/tests/collections/audio/test_audio_models_schroedinger_bridge.py index 00018764f927..6d4a228092b3 100644 --- a/tests/collections/audio/test_audio_models_schroedinger_bridge.py +++ b/tests/collections/audio/test_audio_models_schroedinger_bridge.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import SchroedingerBridgeAudioToAudioModel +from nemo.collections.audio.models.enhancement import SchroedingerBridgeAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -112,9 +112,12 @@ def schroedinger_bridge_model_ncsn_params(): 'pad_dimension_to': 0, # no padding in the frequency dimension } - loss_encoded = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} # computed in the time domain + loss_encoded = { + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', + 'ndim': 4, + } # computed in the time domain - loss_time = {'_target_': 'nemo.collections.audio.losses.MAELoss'} + loss_time = {'_target_': 'nemo.collections.audio.losses.audio.MAELoss'} noise_schedule = { '_target_': 'nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE', diff --git a/tests/collections/audio/test_audio_models_score_based.py b/tests/collections/audio/test_audio_models_score_based.py index 7028c3d285f7..03942ebdfb2f 100644 --- a/tests/collections/audio/test_audio_models_score_based.py +++ b/tests/collections/audio/test_audio_models_score_based.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import ScoreBasedGenerativeAudioToAudioModel +from nemo.collections.audio.models.enhancement import ScoreBasedGenerativeAudioToAudioModel def convert_to_dictconfig(d): @@ -87,7 +87,7 @@ def score_based_base_config(): 'snr': 0.5, } - loss = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} + loss = {'_target_': 'nemo.collections.audio.losses.audio.MSELoss', 'ndim': 4} trainer = { 'max_epochs': -1, diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 10e0e392da6a..c221b43640ef 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -45,9 +45,6 @@ "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", "## Grab the config we'll use in this example\n", "!mkdir configs" ] @@ -795,7 +792,7 @@ }, "outputs": [], "source": [ - "!ls -d -- *.nemo " + "!ls -d -- *.nemo" ] }, { diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 7d80c8e96de5..9d0967474cff 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -28,9 +28,6 @@ "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", "## Grab the 
config we'll use in this example\n", "!mkdir configs" ] @@ -860,7 +857,7 @@ "source": [ "# Custom element types are now imported from helper_files.gpt_components:\n", "# - AttentionType(EncodedRepresentation): Basic Attention Element Type\n", - "# - SelfAttentionType(AttentionType): Self Attention Element Type \n", + "# - SelfAttentionType(AttentionType): Self Attention Element Type\n", "# - CausalSelfAttentionType(SelfAttentionType): Causal Self Attention Element Type\n", "print(\"Custom element types imported successfully!\")" ] @@ -1192,7 +1189,7 @@ "\n", "# Example instantiation (with dummy parameters for demonstration)\n", "dummy_decoder = GPTDecoder(n_embd=32, vocab_size=100)\n", - "print(f\"Input types: {dummy_decoder.input_types}\") \n", + "print(f\"Input types: {dummy_decoder.input_types}\")\n", "print(f\"Output types: {dummy_decoder.output_types}\")\n" ] }, @@ -1305,7 +1302,7 @@ "# block_size: int # length of the model's context window in time\n", "# n_layer: int # depth of the model; number of Transformer blocks in sequence\n", "# n_embd: int # the \"width\" of the model, number of channels in each Transformer\n", - "# n_head: int # number of heads in each multi-head attention inside each Transformer block \n", + "# n_head: int # number of heads in each multi-head attention inside each Transformer block\n", "\n", "# model definition args (optional)\n", "# ================================\n", @@ -1681,10 +1678,10 @@ "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self._train_dl = None\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", " self._validation_dl = None\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = None" ] @@ -1757,7 +1754,7 @@ "\n", " def test_step(self, *args, **kwargs):\n", " return self.step_('test', *args, **kwargs)\n", - " \n", + "\n", " # This is useful for multiple validation data loader setup\n", " def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):\n", " val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()\n", @@ -2159,14 +2156,14 @@ " pin_memory=cfg.pin_memory if 'pin_memory' in cfg else False,\n", " num_workers=cfg.num_workers if 'num_workers' in cfg else 0\n", " )\n", - " \n", + "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self.vocab = None\n", " self._train_dl = self._setup_data_loader(train_data_config)\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", " self._validation_dl = self._setup_data_loader(val_data_config)\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" ] @@ -2414,7 +2411,7 @@ "outputs": [], "source": [ "class NeMoGPTv2(NeMoGPT):\n", - " \n", + "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self.vocab = None\n", " self._train_dl = self._setup_data_loader(train_data_config)\n", @@ -2423,25 +2420,25 @@ " with open('vocab.txt', 'w') as f:\n", " for token in self.vocab:\n", " f.write(f\"{token}\")\n", - " \n", + "\n", " # This is going to register the file into .nemo!\n", " # When you later use .save_to(), it will copy this file into the tar file.\n", " self.register_artifact('vocab_file', 'vocab.txt')\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", - " # This is going to try to find the same file, and if it fails, \n", + " # This is going to try to find the same 
file, and if it fails,\n", " # it will use the copy in .nemo\n", " vocab_file = self.register_artifact('vocab_file', 'vocab.txt')\n", - " \n", + "\n", " with open(vocab_file, 'r') as f:\n", " vocab = []\n", " vocab = f.read().split('')[:-1] # the -1 here is for the dangling token in the file\n", " self.vocab = vocab\n", "\n", " self._validation_dl = self._setup_data_loader(val_data_config)\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", - " # This is going to try to find the same file, and if it fails, \n", + " # This is going to try to find the same file, and if it fails,\n", " # it will use the copy in .nemo\n", " vocab_file = self.register_artifact('vocab_file', 'vocab.txt')\n", "\n", diff --git a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb index 1f1cdbc2a48d..db0159977e84 100644 --- a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb @@ -7,9 +7,9 @@ "outputs": [], "source": [ "\"\"\"\n", - "Please run notebook locally (if you have all the dependencies and a GPU). \n", + "Please run notebook locally (if you have all the dependencies and a GPU).\n", "Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.\n", - " \n", + "\n", "Instructions for setting up Colab are as follows:\n", "1. Open a new Python 3 notebook.\n", "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", @@ -30,10 +30,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.13.0 -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]" ] }, { @@ -65,13 +62,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook requires the `torchaudio` library to be installed for MarbleNet. 
Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -315,17 +305,17 @@ " super().__init__()\n", " self._sample_rate = sample_rate\n", " self.output = True\n", - " \n", + "\n", " def __iter__(self):\n", " return self\n", - " \n", + "\n", " def __next__(self):\n", " if not self.output:\n", " raise StopIteration\n", " self.output = False\n", " return torch.as_tensor(self.signal, dtype=torch.float32), \\\n", " torch.as_tensor(self.signal_shape, dtype=torch.int64)\n", - " \n", + "\n", " def set_signal(self, signal):\n", " self.signal = signal.astype(np.float32)/32768.\n", " self.signal_shape = self.signal.size\n", @@ -373,10 +363,10 @@ "# contiguous signal's frames\n", "# To simplify the flow, we use single threshold to binarize predictions.\n", "class FrameVAD:\n", - " \n", + "\n", " def __init__(self, model_definition,\n", " threshold=0.5,\n", - " frame_len=2, frame_overlap=2.5, \n", + " frame_len=2, frame_overlap=2.5,\n", " offset=10):\n", " '''\n", " Args:\n", @@ -387,7 +377,7 @@ " '''\n", " self.vocab = list(model_definition['labels'])\n", " self.vocab.append('_')\n", - " \n", + "\n", " self.sr = model_definition['sample_rate']\n", " self.threshold = threshold\n", " self.frame_len = frame_len\n", @@ -401,7 +391,7 @@ " dtype=np.float32)\n", " self.offset = offset\n", " self.reset()\n", - " \n", + "\n", " def _decode(self, frame, offset=0):\n", " assert len(frame)==self.n_frame_len\n", " self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]\n", @@ -412,9 +402,9 @@ " logits,\n", " self.vocab\n", " )\n", - " return decoded \n", - " \n", - " \n", + " return decoded\n", + "\n", + "\n", " @torch.no_grad()\n", " def transcribe(self, frame=None):\n", " if frame is None:\n", @@ -423,7 +413,7 @@ " frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')\n", " unmerged = self._decode(frame, self.offset)\n", " return unmerged\n", - " \n", + "\n", " def reset(self):\n", " '''\n", " Reset frame_history and decoder's state\n", @@ -471,14 +461,14 @@ "import wave\n", "\n", "def offline_inference(wave_file, STEP = 0.025, WINDOW_SIZE = 0.5, threshold=0.5):\n", - " \n", - " FRAME_LEN = STEP # infer every STEP seconds \n", + "\n", + " FRAME_LEN = STEP # infer every STEP seconds\n", " CHANNELS = 1 # number of audio channels (expect mono signal)\n", " RATE = 16000 # sample rate, Hz\n", - " \n", - " \n", + "\n", + "\n", " CHUNK_SIZE = int(FRAME_LEN*RATE)\n", - " \n", + "\n", " vad = FrameVAD(model_definition = {\n", " 'sample_rate': SAMPLE_RATE,\n", " 'AudioToMFCCPreprocessor': cfg.preprocessor,\n", @@ -509,7 +499,7 @@ " preds.append(result[0])\n", " proba_b.append(result[2])\n", " proba_s.append(result[3])\n", - " \n", + "\n", " if len(result):\n", " print(result,end='\\n')\n", " empty_counter = 3\n", @@ -517,10 +507,10 @@ " empty_counter -= 1\n", " if empty_counter == 0:\n", " print(' ',end='')\n", - " \n", + "\n", " p.terminate()\n", " vad.reset()\n", - " \n", + "\n", " return preds, proba_b, proba_s" ] }, @@ -542,7 +532,7 @@ "source": [ "demo_wave = 'VAD_demo.wav'\n", "if not os.path.exists(demo_wave):\n", - " !wget \"https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav\" " + " !wget \"https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav\"" ] }, { @@ -612,12 +602,12 @@ "\n", 
"num = len(results)\n", "for i in range(num):\n", - " len_pred = len(results[i][2]) \n", + " len_pred = len(results[i][2])\n", " FRAME_LEN = results[i][0]\n", " ax1 = plt.subplot(num+1,1,i+1)\n", "\n", " ax1.plot(np.arange(audio.size) / sample_rate, audio, 'b')\n", - " ax1.set_xlim([-0.01, int(dur)+1]) \n", + " ax1.set_xlim([-0.01, int(dur)+1])\n", " ax1.tick_params(axis='y', labelcolor= 'b')\n", " ax1.set_ylabel('Signal')\n", " ax1.set_ylim([-1, 1])\n", @@ -633,8 +623,8 @@ "\n", " ax2.set_title(f'step {results[i][0]}s, buffer size {results[i][1]}s')\n", " ax2.set_ylabel('Preds and Probas')\n", - " \n", - " \n", + "\n", + "\n", "ax = plt.subplot(num+1,1,num+1)\n", "S = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=64, fmax=8000)\n", "S_dB = librosa.power_to_db(S, ref=np.max)\n", @@ -664,9 +654,9 @@ "metadata": {}, "outputs": [], "source": [ - "STEP = 0.01 \n", + "STEP = 0.01\n", "WINDOW_SIZE = 0.31\n", - "CHANNELS = 1 \n", + "CHANNELS = 1\n", "RATE = 16000\n", "FRAME_LEN = STEP\n", "THRESHOLD = 0.5\n", @@ -679,7 +669,7 @@ " 'labels': cfg.labels\n", " },\n", " threshold=THRESHOLD,\n", - " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, \n", + " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,\n", " offset=0)\n" ] }, @@ -732,19 +722,19 @@ " print('Listening...')\n", "\n", " stream.start_stream()\n", - " \n", + "\n", " # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !\n", " try:\n", " while stream.is_active():\n", " time.sleep(0.1)\n", - " finally: \n", + " finally:\n", " stream.stop_stream()\n", " stream.close()\n", " p.terminate()\n", "\n", " print()\n", " print(\"PyAudio stopped\")\n", - " \n", + "\n", "else:\n", " print(\"ERROR: No audio input device found, please check if the jupyter notebook has access to your computer's microphone.\")" ] diff --git a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb index 858f162b1834..c7dec1ce8811 100644 --- a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb @@ -9,16 +9,18 @@ "outputs": [], "source": [ "\"\"\"\n", - "Please run notebook locally (if you have all the dependencies and a GPU). \n", + "Please run notebook locally (if you have all the dependencies and a GPU).\n", "Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.\n", - " \n", + "\n", "Instructions for setting up Colab are as follows:\n", "1. Open a new Python 3 notebook.\n", "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. 
Set up microphone for Colab\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -30,10 +32,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.13.0 -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]" ] }, { @@ -55,13 +54,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -243,17 +235,17 @@ " super().__init__()\n", " self._sample_rate = sample_rate\n", " self.output = True\n", - " \n", + "\n", " def __iter__(self):\n", " return self\n", - " \n", + "\n", " def __next__(self):\n", " if not self.output:\n", " raise StopIteration\n", " self.output = False\n", " return torch.as_tensor(self.signal, dtype=torch.float32), \\\n", " torch.as_tensor(self.signal_shape, dtype=torch.int64)\n", - " \n", + "\n", " def set_signal(self, signal):\n", " self.signal = signal.astype(np.float32)/32768.\n", " self.signal_shape = self.signal.size\n", @@ -313,9 +305,9 @@ "# 2) call transcribe(frame) to do ASR on\n", "# contiguous signal's frames\n", "class FrameASR:\n", - " \n", + "\n", " def __init__(self, model_definition,\n", - " frame_len=2, frame_overlap=2.5, \n", + " frame_len=2, frame_overlap=2.5,\n", " offset=0):\n", " '''\n", " Args:\n", @@ -325,7 +317,7 @@ " '''\n", " self.task = model_definition['task']\n", " self.vocab = list(model_definition['labels'])\n", - " \n", + "\n", " self.sr = model_definition['sample_rate']\n", " self.frame_len = frame_len\n", " self.n_frame_len = int(frame_len * self.sr)\n", @@ -338,7 +330,7 @@ " dtype=np.float32)\n", " self.offset = offset\n", " self.reset()\n", - " \n", + "\n", " @torch.no_grad()\n", " def _decode(self, frame, offset=0):\n", " assert len(frame)==self.n_frame_len\n", @@ -348,16 +340,16 @@ " if self.task == 'mbn':\n", " logits = infer_signal(mbn_model, self.buffer).to('cpu').numpy()[0]\n", " decoded = self._mbn_greedy_decoder(logits, self.vocab)\n", - " \n", + "\n", " elif self.task == 'vad':\n", " logits = infer_signal(vad_model, self.buffer).to('cpu').numpy()[0]\n", " decoded = self._vad_greedy_decoder(logits, self.vocab)\n", - " \n", + "\n", " else:\n", " raise(\"Task should either be of mbn or vad!\")\n", - " \n", + "\n", " return decoded[:len(decoded)-offset]\n", - " \n", + "\n", " def transcribe(self, frame=None,merge=False):\n", " if frame is None:\n", " frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)\n", @@ -365,8 +357,8 @@ " frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')\n", " unmerged = self._decode(frame, self.offset)\n", " return 
unmerged\n", - " \n", - " \n", + "\n", + "\n", " def reset(self):\n", " '''\n", " Reset frame_history and decoder's state\n", @@ -374,17 +366,17 @@ " self.buffer=np.zeros(shape=self.buffer.shape, dtype=np.float32)\n", " self.mbn_s = []\n", " self.vad_s = []\n", - " \n", + "\n", " @staticmethod\n", " def _mbn_greedy_decoder(logits, vocab):\n", " mbn_s = []\n", " if logits.shape[0]:\n", " class_idx = np.argmax(logits)\n", " class_label = vocab[class_idx]\n", - " mbn_s.append(class_label) \n", + " mbn_s.append(class_label)\n", " return mbn_s\n", - " \n", - " \n", + "\n", + "\n", " @staticmethod\n", " def _vad_greedy_decoder(logits, vocab):\n", " vad_s = []\n", @@ -439,16 +431,16 @@ " \"\"\"\n", " Arg:\n", " wav_file: wave file to be performed inference on.\n", - " STEP: infer every STEP seconds \n", + " STEP: infer every STEP seconds\n", " WINDOW_SIZE : length of audio to be sent to NN.\n", " \"\"\"\n", - " \n", - " FRAME_LEN = STEP \n", + "\n", + " FRAME_LEN = STEP\n", " CHANNELS = 1 # number of audio channels (expect mono signal)\n", " RATE = SAMPLE_RATE # sample rate, 16000 Hz\n", - " \n", + "\n", " CHUNK_SIZE = int(FRAME_LEN * SAMPLE_RATE)\n", - " \n", + "\n", " mbn = FrameASR(model_definition = {\n", " 'task': 'mbn',\n", " 'sample_rate': SAMPLE_RATE,\n", @@ -467,10 +459,10 @@ " data = wf.readframes(CHUNK_SIZE)\n", " signal = np.frombuffer(data, dtype=np.int16)\n", " mbn_result = mbn.transcribe(signal)\n", - " \n", + "\n", " if len(mbn_result):\n", " print(mbn_result)\n", - " \n", + "\n", " mbn.reset()" ] }, @@ -545,13 +537,13 @@ "metadata": {}, "outputs": [], "source": [ - "vad_threshold = 0.8 \n", + "vad_threshold = 0.8\n", "\n", - "STEP = 0.1 \n", + "STEP = 0.1\n", "WINDOW_SIZE = 0.15\n", "mbn_WINDOW_SIZE = 1\n", "\n", - "CHANNELS = 1 \n", + "CHANNELS = 1\n", "RATE = SAMPLE_RATE\n", "FRAME_LEN = STEP # use step of vad inference as frame len\n", "\n", @@ -563,7 +555,7 @@ " 'JasperEncoder': vad_cfg.encoder,\n", " 'labels': vad_cfg.labels\n", " },\n", - " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, \n", + " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,\n", " offset=0)\n", "\n", "mbn = FrameASR(model_definition = {\n", @@ -602,19 +594,19 @@ " print('Please type input device ID:')\n", " dev_idx = int(input())\n", "\n", - " \n", + "\n", " def callback(in_data, frame_count, time_info, status):\n", " \"\"\"\n", " callback function for streaming audio and performing inference\n", " \"\"\"\n", " signal = np.frombuffer(in_data, dtype=np.int16)\n", - " vad_result = vad.transcribe(signal) \n", - " mbn_result = mbn.transcribe(signal) \n", - " \n", + " vad_result = vad.transcribe(signal)\n", + " mbn_result = mbn.transcribe(signal)\n", + "\n", " if len(vad_result):\n", - " # if speech prob is higher than threshold, we decide it contains speech utterance \n", - " # and activate MatchBoxNet \n", - " if vad_result[3] >= vad_threshold: \n", + " # if speech prob is higher than threshold, we decide it contains speech utterance\n", + " # and activate MatchBoxNet\n", + " if vad_result[3] >= vad_threshold:\n", " print(mbn_result) # print mbn result when speech present\n", " else:\n", " print(\"no-speech\")\n", @@ -629,21 +621,21 @@ " stream_callback=callback,\n", " frames_per_buffer=CHUNK_SIZE)\n", "\n", - " \n", + "\n", " print('Listening...')\n", " stream.start_stream()\n", - " \n", + "\n", " # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !\n", " try:\n", " while stream.is_active():\n", " time.sleep(0.1)\n", - " finally: \n", + " 
finally:\n", " stream.stop_stream()\n", " stream.close()\n", " p.terminate()\n", " print()\n", " print(\"PyAudio stopped\")\n", - " \n", + "\n", "else:\n", " print('ERROR: No audio input device found.')" ] diff --git a/tutorials/asr/Streaming_Multitalker_ASR.ipynb b/tutorials/asr/Streaming_Multitalker_ASR.ipynb index 9d9bb76bd8ef..53b1cfaf3190 100644 --- a/tutorials/asr/Streaming_Multitalker_ASR.ipynb +++ b/tutorials/asr/Streaming_Multitalker_ASR.ipynb @@ -27,10 +27,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -145,7 +142,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio, sr=sr) \n", + "signal, sr = librosa.load(an4_audio, sr=sr)\n", "\n", "fig, ax = plt.subplots(1, 1)\n", "fig.set_figwidth(20)\n", @@ -187,7 +184,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_streaming_sortformer_4spk-v2\")\n", "else:\n", " # You can download \".nemo\" file from https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2 and specify the path.\n", @@ -221,7 +218,7 @@ "import math\n", "import torch\n", "import torch.amp\n", - "from tqdm import tqdm \n", + "from tqdm import tqdm\n", "\n", "# If cuda is available, assign the model to cuda\n", "if torch.cuda.is_available():\n", @@ -350,9 +347,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Diarization Predictions (Speaker Activity)', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -383,9 +380,9 @@ "source": [ "from nemo.collections.asr.models import ASRModel\n", "import torch\n", - " \n", + "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " asr_model = ASRModel.from_pretrained(\"nvidia/multitalker-parakeet-streaming-0.6b-v1\")\n", "else:\n", " # You can download \".nemo\" file from https://huggingface.co/nvidia/multitalker-parakeet-streaming-0.6b-v1 and specify the path.\n", @@ -395,7 +392,7 @@ "asr_model.eval()\n", "if torch.cuda.is_available():\n", " asr_model.to(torch.device(\"cuda\"))\n", - " \n", + "\n", "print(\"ASR Model loaded successfully!\")" ] }, @@ -497,7 +494,7 @@ "\n", " # If `cuda` is a negative number, inference will be on CPU only.\n", " cuda: Optional[int] = None\n", - " allow_mps: bool = False \n", + " allow_mps: bool = False\n", " matmul_precision: str = \"highest\" # Literal[\"highest\", \"high\", \"medium\"]\n", "\n", " # ASR Configs\n", @@ -669,7 +666,7 @@ " drop_extra_pre_encoded=drop_extra_pre_encoded,\n", " )\n", " 
pprint(multispk_asr_streamer.instance_manager.batch_asr_states[0].seglsts)\n", - " \n", + "\n", "seglst_dict_list = multispk_asr_streamer.generate_seglst_dicts_from_parallel_streaming(samples=samples)\n", "\n", "from pprint import pprint\n", @@ -706,9 +703,9 @@ " end_time = seglst.get('end_time', 0.0)\n", " words = seglst.get('words', '')\n", " session_id = seglst.get('session_id', '')\n", - " \n", + "\n", " print(f\"[{idx+1}] {speaker} ({start_time:.2f}s - {end_time:.2f}s): {words}\")\n", - " \n", + "\n", " print(f\"\\n{'-'*80}\")\n", " print(f\"Total segments: {len(seglst_dict_list)}\")\n", "else:\n", diff --git a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb index cdcfcc82b786..b8fab09b5787 100644 --- a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb +++ b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb @@ -31,10 +31,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -144,7 +141,7 @@ " plt.axis([0,len(signal),-0.5,+0.5])\n", " time_axis,_ = plt.xticks();\n", " plt.xticks(time_axis[:-1],time_axis[:-1]/sample_rate);\n", - " \n", + "\n", "COLORS=\"b g c m y\".split()\n", "\n", "def get_color(signal,speech_labels,sample_rate=16000):\n", @@ -157,8 +154,8 @@ " else:\n", " code = COLORS[int(label.split('_')[-1])]\n", " c[start:end]=code\n", - " \n", - " return c " + "\n", + " return c" ] }, { @@ -238,18 +235,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a manifest file for input with below format. 
\n", - "# {\"audio_filepath\": \"/path/to/audio_file\", \"offset\": 0, \"duration\": null, \"label\": \"infer\", \"text\": \"-\", \n", + "# Create a manifest file for input with below format.\n", + "# {\"audio_filepath\": \"/path/to/audio_file\", \"offset\": 0, \"duration\": null, \"label\": \"infer\", \"text\": \"-\",\n", "# \"num_speakers\": null, \"rttm_filepath\": \"/path/to/rttm/file\", \"uem_filepath\"=\"/path/to/uem/filepath\"}\n", "import json\n", "meta = {\n", - " 'audio_filepath': AUDIO_FILENAME, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': None, \n", - " 'rttm_filepath': None, \n", + " 'audio_filepath': AUDIO_FILENAME,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': None,\n", + " 'rttm_filepath': None,\n", " 'uem_filepath' : None\n", "}\n", "with open(os.path.join(data_dir,'input_manifest.json'),'w') as fp:\n", @@ -279,10 +276,10 @@ "cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model\n", "cfg.diarizer.clustering.parameters.oracle_num_speakers=False\n", "\n", - "# Using Neural VAD and Conformer ASR \n", + "# Using Neural VAD and Conformer ASR\n", "cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'\n", "cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'\n", - "cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD \n", + "cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD\n", "cfg.diarizer.asr.parameters.asr_based_vad = False" ] }, @@ -576,13 +573,13 @@ "metadata": {}, "outputs": [], "source": [ - "def write_ctm(path, the_list): \n", + "def write_ctm(path, the_list):\n", " outF = open(path, \"w\")\n", " for line in the_list:\n", " outF.write(line)\n", " outF.write(\"\\n\")\n", " outF.close()\n", - " \n", + "\n", "write_ctm(f\"{data_dir}/an4_diarize_test.ctm\", an4_diarize_test_ctm)" ] }, @@ -603,7 +600,7 @@ "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate\n", "from nemo.collections.asr.metrics.wer import word_error_rate\n", "from nemo.collections.asr.parts.utils.diarization_utils import convert_word_dict_seq_to_text, convert_ctm_to_text\n", - "# Provide a list containing the paths to the reference CTM files \n", + "# Provide a list containing the paths to the reference CTM files\n", "# which have the same order with filenames in word_seq_lists.\n", "\n", "word_seq_list = trans_info_dict['an4_diarize_test']['words']\n", @@ -633,7 +630,7 @@ "metadata": {}, "outputs": [], "source": [ - "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate \n", + "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate\n", "from nemo.collections.asr.metrics.wer import word_error_rate\n", "\n", "cpWER, concat_hyp, concat_ref = concat_perm_word_error_rate([spk_hypothesis], [spk_reference])\n", @@ -711,14 +708,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a new manifest file for input with the reference CTM file. 
\n", + "# Create a new manifest file for input with the reference CTM file.\n", "meta = {\n", - " 'audio_filepath': AUDIO_FILENAME, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': 2, \n", + " 'audio_filepath': AUDIO_FILENAME,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': 2,\n", " 'rttm_filepath': None,\n", " 'ctm_filepath': f\"{data_dir}/an4_diarize_test.ctm\",\n", " 'uem_filepath' : None\n", @@ -731,7 +728,7 @@ "cfg.diarizer.manifest_filepath = os.path.join(data_dir,'input_manifest.json')\n", "!cat {cfg.diarizer.manifest_filepath}\n", "\n", - "# We need to call `make_file_lists` again to update manifest file to `asr_diar_offline` instance \n", + "# We need to call `make_file_lists` again to update manifest file to `asr_diar_offline` instance\n", "asr_diar_offline.make_file_lists()" ] }, @@ -799,7 +796,7 @@ " shutil.copyfileobj(f_in, f_out)\n", " f_in.close()\n", " f_out.close()\n", - " \n", + "\n", "ARPA_URL = 'https://kaldi-asr.org/models/5/4gram_big.arpa.gz'\n", "f = wget.download(ARPA_URL, data_dir)\n", "gunzip(f,f.replace(\".gz\",\"\"))" diff --git a/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb b/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb index 273cc00c1f56..c2c46674582a 100644 --- a/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -136,7 +133,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -186,7 +183,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_sortformer_4spk-v1\")\n", "else:\n", " # You can downloaded \".nemo\" file from https://huggingface.co/nvidia/diar_sortformer_4spk-v1 and specify the path.\n", @@ -225,9 +222,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Predictions', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -235,7 +232,7 @@ " axs.set_xlabel(f\"80 ms Frames\", fontsize=FS)\n", " axs.grid(which='minor', color=grid_color_p, linestyle='-', linewidth=LW)\n", "\n", - " plt.savefig('plot.png', dpi=300) \n", + " plt.savefig('plot.png', dpi=300)\n", " plt.show()\n", "\n", "\n", @@ -297,7 +294,7 @@ "source": [ "from nemo.collections.asr.parts.utils.vad_utils import 
load_postprocessing_from_yaml\n", "import json\n", - "from omegaconf import OmegaConf \n", + "from omegaconf import OmegaConf\n", "post_processing_params = load_postprocessing_from_yaml(MODEL_CONFIG)\n", "print(json.dumps(OmegaConf.to_container(post_processing_params), indent=4))" ] diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb index cf3416c4ad02..7d46393d0706 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -249,7 +246,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -341,18 +338,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a manifest for input with below format. \n", - "# {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-', \n", + "# Create a manifest for input with below format.\n", + "# {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',\n", "# 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}\n", "import json\n", "meta = {\n", - " 'audio_filepath': an4_audio, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': 2, \n", - " 'rttm_filepath': an4_rttm, \n", + " 'audio_filepath': an4_audio,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': 2,\n", + " 'rttm_filepath': an4_rttm,\n", " 'uem_filepath' : None\n", "}\n", "with open('data/input_manifest.json','w') as fp:\n", @@ -426,10 +423,10 @@ "config.diarizer.out_dir = output_dir # Directory to store intermediate files and prediction outputs\n", "pretrained_speaker_model = 'titanet_large'\n", "config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model\n", - "config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5] \n", - "config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1] \n", - "config.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1] \n", - "config.diarizer.oracle_vad = True # ----> ORACLE VAD \n", + "config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]\n", + "config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]\n", + "config.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1]\n", + "config.diarizer.oracle_vad = True # ----> ORACLE VAD\n", "config.diarizer.clustering.parameters.oracle_num_speakers = False" ] }, @@ -536,7 +533,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model \n", + "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model\n", 
"config.diarizer.msdd_model.parameters.sigmoid_threshold = [0.7, 1.0] # Evaluate with T=0.7 and T=1.0" ] }, @@ -692,7 +689,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue \n", + "config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue\n", "\n", "output_dir = os.path.join(ROOT, 'outputs')\n", "config.diarizer.manifest_filepath = 'data/input_manifest.json'\n", @@ -784,10 +781,10 @@ "\n", "plot(\n", " an4_audio,\n", - " vad_output_filepath, \n", + " vad_output_filepath,\n", " an4_rttm,\n", " per_args = config.diarizer.vad.parameters, #threshold\n", - " ) \n", + " )\n", "\n", "print(f\"VAD params:{OmegaConf.to_yaml(config.diarizer.vad.parameters)}\")" ] @@ -857,7 +854,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model \n", + "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model\n", "config.diarizer.msdd_model.parameters.sigmoid_threshold = [0.7, 1.0] # Evaluate with T=0.7 and T=1.0\n", "system_vad_msdd_model = NeuralDiarizer(cfg=config)" ] diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 48f78e0c1c8a..3db99889d92e 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -28,10 +28,7 @@ "\n", "## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "# Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -324,7 +321,7 @@ "outputs": [], "source": [ "# This line will print the entire config of sample TitaNet model\n", - "!mkdir conf \n", + "!mkdir conf\n", "!wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/speaker_tasks/recognition/conf/titanet-large.yaml\n", "MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/titanet-large.yaml')\n", "config = OmegaConf.load(MODEL_CONFIG)\n", @@ -1120,7 +1117,7 @@ "\n", " all_embs=[]\n", " out_embeddings = {}\n", - " \n", + "\n", " for test_batch in tqdm(speaker_model.test_dataloader()):\n", " test_batch = [x.to(device) for x in test_batch]\n", " audio_signal, audio_signal_len, labels, slices = test_batch\n", diff --git a/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb index 69aeb96c96c7..30431972d5f0 100644 --- a/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -132,7 +129,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = 
plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -176,7 +173,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_streaming_sortformer_4spk-v2\")\n", "else:\n", " # You can downloaded \".nemo\" file from https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2 and specify the path.\n", @@ -209,9 +206,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Predictions', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -219,7 +216,7 @@ " axs.set_xlabel(f\"80 ms Frames\", fontsize=FS)\n", " axs.grid(which='minor', color=grid_color_p, linestyle='-', linewidth=LW)\n", "\n", - " plt.savefig('plot.png', dpi=300) \n", + " plt.savefig('plot.png', dpi=300)\n", " plt.show()" ] }, @@ -256,7 +253,7 @@ "import math\n", "import torch\n", "import torch.amp\n", - "from tqdm import tqdm \n", + "from tqdm import tqdm\n", "\n", "# If cuda is available, assign the model to cuda\n", "if torch.cuda.is_available():\n", @@ -374,7 +371,7 @@ " )\n", " # plot the predictions\n", " plot_preds[:,:total_preds.shape[1]] = total_preds\n", - " plot_diarout(plot_preds[0,:]) \n", + " plot_diarout(plot_preds[0,:])\n", " time.sleep(chunk_duration_seconds)" ] }
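Editor's note (illustrative, not part of the patch): with the torchaudio install dropped from these tutorial setup cells, audio loading in the streaming examples goes through librosa, and the streaming loops above feed the models fixed-duration chunks (see `CHUNK_SIZE` and `chunk_duration_seconds` in the hunks above). A minimal sketch of that chunking step follows, assuming only librosa and numpy; the helper name `iter_audio_chunks` is hypothetical and not part of any tutorial.

    import numpy as np
    import librosa

    def iter_audio_chunks(path, sample_rate=16000, chunk_duration_seconds=0.08):
        # Load as mono float32 at the target rate (librosa resamples if needed).
        signal, _ = librosa.load(path, sr=sample_rate)
        chunk_len = int(chunk_duration_seconds * sample_rate)
        # Yield consecutive fixed-length chunks, zero-padding the final one.
        for start in range(0, len(signal), chunk_len):
            chunk = signal[start:start + chunk_len]
            if len(chunk) < chunk_len:
                chunk = np.pad(chunk, (0, chunk_len - len(chunk)))
            yield chunk

Each yielded chunk can then be handed to the per-chunk transcribe/forward calls used in the streaming tutorials, in place of reading frames from a wave file or a PyAudio stream.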