diff --git a/docker/Dockerfile.speech b/docker/Dockerfile.speech index 9fea05036868..65e55987dc44 100644 --- a/docker/Dockerfile.speech +++ b/docker/Dockerfile.speech @@ -22,8 +22,6 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 FROM ${BASE_IMAGE} as nemo-deps # dependency flags; should be declared after FROM -# torchaudio: not required by default -ARG REQUIRE_TORCHAUDIO=false # k2: not required by default ARG REQUIRE_K2=false # ais cli: not required by default, install only if required @@ -96,18 +94,6 @@ WORKDIR /tmp/ # uninstall stuff from base container RUN pip3 uninstall -y sacrebleu torchtext -# build torchaudio -WORKDIR /tmp/torchaudio_build -COPY scripts/installers /tmp/torchaudio_build/scripts/installers/ -RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \ - echo ${INSTALL_MSG}; \ - if [ ${INSTALL_CODE} -ne 0 ]; then \ - echo "torchaudio installation failed"; \ - if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \ - exit ${INSTALL_CODE}; \ - else echo "Skipping failed torchaudio installation"; fi \ - else echo "torchaudio installed successfully"; fi - COPY scripts /tmp/nemo/scripts/ # install correct graphviz version (k2 and pynini visualization tool), skip if installation fails RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \ diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index d34461937284..d77a2644e76c 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -24,7 +24,7 @@ import torch from omegaconf import OmegaConf -from nemo.collections.audio.models import AudioToAudioModel +from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel from nemo.core.config import hydra_runner from nemo.utils import logging, model_utils diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index 085fd0e63183..338ca6cd9a67 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -19,11 +19,11 @@ from typing import Any, Optional import torch -from packaging import version from nemo.collections.asr.parts.numba.spec_augment import SpecAugmentNumba, spec_augment_launch_heuristics -from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures, FilterbankFeaturesTA +from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures from nemo.collections.asr.parts.submodules.spectr_augment import SpecAugment, SpecCutout +from nemo.collections.audio.parts.utils.transforms import MFCC from nemo.core.classes import Exportable, NeuralModule, typecheck from nemo.core.neural_types import ( AudioSignal, @@ -37,18 +37,6 @@ from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils import logging, logging_mode -try: - import torchaudio - import torchaudio.functional - import torchaudio.transforms - - TORCHAUDIO_VERSION = version.parse(torchaudio.__version__) - TORCHAUDIO_VERSION_MIN = version.parse('0.5') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - __all__ = [ 'AudioToMelSpectrogramPreprocessor', 'AudioToMFCCPreprocessor', @@ -171,7 +159,6 @@ class AudioToMelSpectrogramPreprocessor(AudioPreprocessor, Exportable): Defaults to 0.0 nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation. 
Defaults to 4000 - use_torchaudio: Whether to use the `torchaudio` implementation. mel_norm: Normalization used for mel filterbank weights. Defaults to 'slaney' (area normalization) stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints. @@ -237,8 +224,8 @@ def __init__( rng=None, nb_augmentation_prob=0.0, nb_max_freq=4000, - use_torchaudio: bool = False, mel_norm="slaney", + use_torchaudio: bool = False, # Deprecated arguments; kept for config compatibility stft_exact_pad=False, # Deprecated arguments; kept for config compatibility stft_conv=False, # Deprecated arguments; kept for config compatibility ): @@ -256,11 +243,7 @@ def __init__( super().__init__(n_window_size, n_window_stride) # Given the long and similar argument list, point to the class and instantiate it by reference - if not use_torchaudio: - featurizer_class = FilterbankFeatures - else: - featurizer_class = FilterbankFeaturesTA - self.featurizer = featurizer_class( + self.featurizer = FilterbankFeatures( sample_rate=self._sample_rate, n_window_size=n_window_size, n_window_stride=n_window_stride, @@ -306,7 +289,6 @@ def filter_banks(self): class AudioToMFCCPreprocessor(AudioPreprocessor): """Preprocessor that converts wavs to MFCCs. - Uses torchaudio.transforms.MFCC. Args: sample_rate: The sample rate of the audio. @@ -382,14 +364,6 @@ def __init__( log=True, ): self._sample_rate = sample_rate - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. Some features might not work.') - - raise ModuleNotFoundError( - "torchaudio is not installed but is necessary for " - "AudioToMFCCPreprocessor. We recommend you try " - "building it from source for the PyTorch version you have." - ) if window_size and n_window_size: raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.") if window_stride and n_window_stride: @@ -425,7 +399,7 @@ def __init__( mel_kwargs['window_fn'] = window_fn # Use torchaudio's implementation of MFCCs as featurizer - self.featurizer = torchaudio.transforms.MFCC( + self.featurizer = MFCC( sample_rate=self._sample_rate, n_mfcc=n_mfcc, dct_type=dct_type, @@ -746,8 +720,8 @@ class AudioToMelSpectrogramPreprocessorConfig: rng: Optional[str] = None nb_augmentation_prob: float = 0.0 nb_max_freq: int = 4000 - use_torchaudio: bool = False mel_norm: str = "slaney" + use_torchaudio: bool = False # Deprecated argument, kept for compatibility with older checkpoints. stft_exact_pad: bool = False # Deprecated argument, kept for compatibility with older checkpoints. stft_conv: bool = False # Deprecated argument, kept for compatibility with older checkpoints. 
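Note: with the change above, AudioToMFCCPreprocessor is backed by the vendored MFCC transform added later in this patch (nemo.collections.audio.parts.utils.transforms) instead of torchaudio.transforms.MFCC. A minimal usage sketch follows; the input shape and mel-spectrogram settings are illustrative assumptions, and the arguments mirror roughly what the preprocessor forwards to its featurizer.

    import torch

    from nemo.collections.audio.parts.utils.transforms import MFCC

    # Batch of two 1-second waveforms at 16 kHz (illustrative random input).
    waveform = torch.randn(2, 16000)

    # Vendored drop-in for torchaudio.transforms.MFCC; settings below are examples only.
    mfcc = MFCC(
        sample_rate=16000,
        n_mfcc=13,
        dct_type=2,
        norm="ortho",
        log_mels=False,
        melkwargs={"n_fft": 512, "hop_length": 160, "n_mels": 64},
    )

    features = mfcc(waveform)  # -> (batch, n_mfcc, frames), here (2, 13, 101)
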
diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index cffc94d276e3..ec0fa8f6f74d 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -34,7 +34,6 @@ # This file contains code artifacts adapted from https://github.com/ryanleary/patter import math import random -from typing import Optional, Tuple, Union import librosa import numpy as np @@ -45,14 +44,6 @@ from nemo.collections.asr.parts.preprocessing.segment import AudioSegment from nemo.utils import logging -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - - CONSTANT = 1e-5 @@ -499,187 +490,3 @@ def forward(self, x, seq_len, linear_spec=False): if pad_amt != 0: x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value) return x, seq_len - - -class FilterbankFeaturesTA(nn.Module): - """ - Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction. - - See `AudioToMelSpectrogramPreprocessor` for args. - - """ - - def __init__( - self, - sample_rate: int = 16000, - n_window_size: int = 320, - n_window_stride: int = 160, - normalize: Optional[str] = "per_feature", - nfilt: int = 64, - n_fft: Optional[int] = None, - preemph: float = 0.97, - lowfreq: float = 0, - highfreq: Optional[float] = None, - log: bool = True, - log_zero_guard_type: str = "add", - log_zero_guard_value: Union[float, str] = 2**-24, - dither: float = 1e-5, - window: str = "hann", - pad_to: int = 0, - pad_value: float = 0.0, - mel_norm="slaney", - # Seems like no one uses these options anymore. Don't convolute the code by supporting thm. - use_grads: bool = False, # Deprecated arguments; kept for config compatibility - max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility - frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility - exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility - nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility - mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility - rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility - stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - stft_conv: bool = False, # Deprecated arguments; kept for config compatibility - ): - super().__init__() - if not HAVE_TORCHAUDIO: - raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}") - - # Make sure log zero guard is supported, if given as a string - supported_log_zero_guard_strings = {"eps", "tiny"} - if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings: - raise ValueError( - f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}" - ) - - # Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class - self.torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'ones': torch.ones, - None: torch.ones, - } - - # Ensure we can look up the window function - if window not in self.torch_windows: - raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}") - - 
self.win_length = n_window_size - self.hop_length = n_window_stride - self._sample_rate = sample_rate - self._normalize_strategy = normalize - self._use_log = log - self._preemphasis_value = preemph - self.log_zero_guard_type = log_zero_guard_type - self.log_zero_guard_value: Union[str, float] = log_zero_guard_value - self.dither = dither - self.pad_to = pad_to - self.pad_value = pad_value - self.n_fft = n_fft - self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram( - sample_rate=self._sample_rate, - win_length=self.win_length, - hop_length=self.hop_length, - n_mels=nfilt, - window_fn=self.torch_windows[window], - mel_scale="slaney", - norm=mel_norm, - n_fft=n_fft, - f_max=highfreq, - f_min=lowfreq, - wkwargs={"periodic": False}, - ) - - @property - def filter_banks(self): - """Matches the analogous class""" - return self._mel_spec_extractor.mel_scale.fb - - def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float: - if isinstance(self.log_zero_guard_value, float): - return self.log_zero_guard_value - return getattr(torch.finfo(dtype), self.log_zero_guard_value) - - def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor: - if self.training and self.dither > 0.0: - noise = torch.randn_like(signals) * self.dither - signals = signals + noise - return signals - - def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor: - if self._preemphasis_value is not None: - padded = torch.nn.functional.pad(signals, (1, 0)) - signals = signals - self._preemphasis_value * padded[:, :-1] - return signals - - def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor: - out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long() - return out_lengths - - def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor: - # Only apply during training; else need to capture dynamic shape for exported models - if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0: - return features - pad_length = self.pad_to - (features.shape[-1] % self.pad_to) - return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value) - - def _apply_log(self, features: torch.Tensor) -> torch.Tensor: - if self._use_log: - zero_guard = self._resolve_log_zero_guard_value(features.dtype) - if self.log_zero_guard_type == "add": - features = features + zero_guard - elif self.log_zero_guard_type == "clamp": - features = features.clamp(min=zero_guard) - else: - raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'") - features = features.log() - return features - - def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor: - # Complex FFT needs to be done in single precision - with torch.amp.autocast('cuda', enabled=False): - features = self._mel_spec_extractor(waveform=signals) - return features - - def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor: - # For consistency, this function always does a masked fill even if not normalizing. 
- mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False) - features = features.masked_fill(mask, 0.0) - # Maybe don't normalize - if self._normalize_strategy is None: - return features - # Use the log zero guard for the sqrt zero guard - guard_value = self._resolve_log_zero_guard_value(features.dtype) - if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features": - # 'all_features' reduces over each sample; 'per_feature' reduces over each channel - reduce_dim = 2 - if self._normalize_strategy == "all_features": - reduce_dim = [1, 2] - # [B, D, T] -> [B, D, 1] or [B, 1, 1] - means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1)) - stds = ( - features.sub(means) - .masked_fill(mask, 0.0) - .pow(2.0) - .sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1] - .div(lengths.view(-1, 1, 1) - 1) # assume biased estimator - .clamp(min=guard_value) # avoid sqrt(0) - .sqrt() - ) - features = (features - means) / (stds + eps) - else: - # Deprecating constant std/mean - raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}") - features = features.masked_fill(mask, 0.0) - return features - - def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - feature_lengths = self._compute_output_lengths(input_lengths=length) - signals = self._apply_dithering(signals=input_signal) - signals = self._apply_preemphasis(signals=signals) - features = self._extract_spectrograms(signals=signals) - features = self._apply_log(features=features) - features = self._apply_normalization(features=features, lengths=feature_lengths) - features = self._apply_pad_to(features=features) - return features, feature_lengths diff --git a/nemo/collections/audio/losses/__init__.py b/nemo/collections/audio/losses/__init__.py index 00db9e62bc33..341a77c5bc66 100644 --- a/nemo/collections/audio/losses/__init__.py +++ b/nemo/collections/audio/losses/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo.collections.audio.losses.audio import MAELoss, MSELoss, SDRLoss diff --git a/nemo/collections/audio/losses/maxine/losses_combined.py b/nemo/collections/audio/losses/maxine/losses_combined.py index 1ea46aaaca12..95d95fe64ae8 100644 --- a/nemo/collections/audio/losses/maxine/losses_combined.py +++ b/nemo/collections/audio/losses/maxine/losses_combined.py @@ -18,15 +18,8 @@ import torch -try: - from torchaudio.functional import resample - from torchaudio.transforms import MelSpectrogram - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - -from nemo.collections.asr.models import ASRModel +from nemo.collections.asr.models.asr_model import ASRModel +from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, resample from nemo.core import Loss, Typing, typecheck from nemo.core.neural_types import LengthsType, LossType, NeuralType, VoidType from nemo.utils import logging @@ -94,12 +87,6 @@ def __init__( conformer_model=STT_EN_CONFORMER_CTC_SMALL_v1_6_0, epsilon=float(5.9604644775390625e-8), ): - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. 
Some features might not work.') - - raise ModuleNotFoundError( - f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}" - ) super().__init__() self.sample_rate = sample_rate diff --git a/nemo/collections/audio/models/__init__.py b/nemo/collections/audio/models/__init__.py index 4e743cd8c82b..341a77c5bc66 100644 --- a/nemo/collections/audio/models/__init__.py +++ b/nemo/collections/audio/models/__init__.py @@ -11,12 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel -from nemo.collections.audio.models.enhancement import ( - EncMaskDecAudioToAudioModel, - FlowMatchingAudioToAudioModel, - PredictiveAudioToAudioModel, - SchroedingerBridgeAudioToAudioModel, - ScoreBasedGenerativeAudioToAudioModel, -) diff --git a/nemo/collections/audio/parts/utils/resampling.py b/nemo/collections/audio/parts/utils/resampling.py deleted file mode 100644 index 8b82ccf04a72..000000000000 --- a/nemo/collections/audio/parts/utils/resampling.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# NOTE: The code below originates from torchaudio repository, version 2.6.0. -# It can be found under: https://github.com/pytorch/audio/tree/release/2.6 -# The modifications applied are mostly cosmetic. -# The inclusion of this code in NeMo allows us to avoid -# a dependency with a problematic build process. -# This code is licensed under the BSD 2-Clause License, -# included verbatim from the torchaudio repository below: -# -# BSD 2-Clause License -# -# Copyright (c) 2017 Facebook Inc. (Soumith Chintala), -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math -from typing import Optional - -import torch - - -class Resample(torch.nn.Module): - r"""Resample a signal from one frequency to another. A resampling method can be given. - - .. devices:: CPU CUDA - - .. properties:: Autograd TorchScript - - Note: - If resampling on waveforms of higher precision than float32, there may be a small loss of precision - because the kernel is cached once as float32. If high precision resampling is important for your application, - the functional form will retain higher precision, but run slower because it does not cache the kernel. - Alternatively, you could rewrite a transform that caches a higher precision kernel. - - Args: - orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) - new_freq (int, optional): The desired frequency. (Default: ``16000``) - resampling_method (str, optional): The resampling method to use. - Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``) - lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper - but less efficient. (Default: ``6``) - rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. - Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) - beta (float or None, optional): The shape parameter used for kaiser window. - dtype (torch.device, optional): - Determnines the precision that resampling kernel is pre-computed and cached. If not provided, - kernel is computed with ``torch.float64`` then cached as ``torch.float32``. - If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and - cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this - providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still - carried out on ``torch.float64``. - - Example - >>> waveform, sample_rate = ... 
- >>> transform = transforms.Resample(sample_rate, sample_rate/10) - >>> waveform = transform(waveform) - """ - - def __init__( - self, - orig_freq: int = 16000, - new_freq: int = 16000, - resampling_method: str = "sinc_interp_hann", - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - beta: Optional[float] = None, - *, - dtype: Optional[torch.dtype] = None, - ) -> None: - super().__init__() - - self.orig_freq = orig_freq - self.new_freq = new_freq - self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) - self.resampling_method = resampling_method - self.lowpass_filter_width = lowpass_filter_width - self.rolloff = rolloff - self.beta = beta - - if self.orig_freq != self.new_freq: - kernel, self.width = _get_sinc_resample_kernel( - self.orig_freq, - self.new_freq, - self.gcd, - self.lowpass_filter_width, - self.rolloff, - self.resampling_method, - beta, - dtype=dtype, - ) - self.register_buffer("kernel", kernel) - - def forward(self, waveform: torch.Tensor) -> torch.Tensor: - r""" - Args: - waveform (Tensor): Tensor of audio of dimension (..., time). - - Returns: - Tensor: Output signal of dimension (..., time). - """ - if self.orig_freq == self.new_freq: - return waveform - return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) - - -def resample( - waveform: torch.Tensor, - orig_freq: int, - new_freq: int, - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - resampling_method: str = "sinc_interp_hann", - beta: Optional[float] = None, -) -> torch.Tensor: - r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. - - .. devices:: CPU CUDA - - .. properties:: Autograd TorchScript - - Note: - ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in - more efficient computation if resampling multiple waveforms with the same resampling parameters. - - Args: - waveform (Tensor): The input signal of dimension `(..., time)` - orig_freq (int): The original frequency of the signal - new_freq (int): The desired frequency - lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper - but less efficient. (Default: ``6``) - rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. - Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) - resampling_method (str, optional): The resampling method to use. - Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) - beta (float or None, optional): The shape parameter used for kaiser window. 
- - Returns: - Tensor: The waveform at the new frequency of dimension `(..., time).` - """ - - if orig_freq <= 0.0 or new_freq <= 0.0: - raise ValueError("Original frequency and desired frequecy should be positive") - - if orig_freq == new_freq: - return waveform - - gcd = math.gcd(int(orig_freq), int(new_freq)) - - kernel, width = _get_sinc_resample_kernel( - orig_freq, - new_freq, - gcd, - lowpass_filter_width, - rolloff, - resampling_method, - beta, - waveform.device, - waveform.dtype, - ) - resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) - return resampled - - -def _get_sinc_resample_kernel( - orig_freq: int, - new_freq: int, - gcd: int, - lowpass_filter_width: int = 6, - rolloff: float = 0.99, - resampling_method: str = "sinc_interp_hann", - beta: Optional[float] = None, - device: torch.device = "cpu", - dtype: Optional[torch.dtype] = None, -): - if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): - raise Exception( - "Frequencies must be of integer type to ensure quality resampling computation. " - "To work around this, manually convert both frequencies to integer values " - "that maintain their resampling rate ratio before passing them into the function. " - "Example: To downsample a 44100 hz waveform by a factor of 8, use " - "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " - "For more information, please refer to https://github.com/pytorch/audio/issues/1487." - ) - - if resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: - raise ValueError("Invalid resampling method: {}".format(resampling_method)) - - orig_freq = int(orig_freq) // gcd - new_freq = int(new_freq) // gcd - - if lowpass_filter_width <= 0: - raise ValueError("Low pass filter width should be positive.") - base_freq = min(orig_freq, new_freq) - # This will perform antialiasing filtering by removing the highest frequencies. - # At first I thought I only needed this when downsampling, but when upsampling - # you will get edge artifacts without this, as the edge is equivalent to zero padding, - # which will add high freq artifacts. - base_freq *= rolloff - - # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) - # using the sinc interpolation formula: - # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) - # We can then sample the function x(t) with a different sample rate: - # y[j] = x(j / new_freq) - # or, - # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) - - # We see here that y[j] is the convolution of x[i] with a specific filter, for which - # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. - # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. - # Indeed: - # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) - # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) - # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) - # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. - # This will explain the F.conv1d after, with a stride of orig_freq. - width = math.ceil(lowpass_filter_width * orig_freq / base_freq) - # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., - # they will have a lot of almost zero values to the left or to the right... 
- # There is probably a way to evaluate those filters more efficiently, but this is kept for - # future work. - idx_dtype = dtype if dtype is not None else torch.float64 - - idx = torch.arange(-width, width + orig_freq, dtype=idx_dtype, device=device)[None, None] / orig_freq - - t = torch.arange(0, -new_freq, -1, dtype=dtype, device=device)[:, None, None] / new_freq + idx - t *= base_freq - t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) - - # we do not use built in torch windows here as we need to evaluate the window - # at specific positions, not over a regular grid. - if resampling_method == "sinc_interp_hann": - window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 - else: - # sinc_interp_kaiser - if beta is None: - beta = 14.769656459379492 - beta_tensor = torch.tensor(float(beta)) - window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) - - t *= math.pi - - scale = base_freq / orig_freq - kernels = torch.where(t == 0, torch.tensor(1.0).to(t), t.sin() / t) - kernels *= window * scale - - if dtype is None: - kernels = kernels.to(dtype=torch.float32) - - return kernels, width - - -def _apply_sinc_resample_kernel( - waveform: torch.Tensor, - orig_freq: int, - new_freq: int, - gcd: int, - kernel: torch.Tensor, - width: int, -): - if not waveform.is_floating_point(): - raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") - - orig_freq = int(orig_freq) // gcd - new_freq = int(new_freq) // gcd - - # pack batch - shape = waveform.size() - waveform = waveform.view(-1, shape[-1]) - - num_wavs, length = waveform.shape - waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) - resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) - resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) - target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long() - resampled = resampled[..., :target_length] - - # unpack batch - resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) - return resampled diff --git a/nemo/collections/audio/parts/utils/transforms.py b/nemo/collections/audio/parts/utils/transforms.py new file mode 100644 index 000000000000..6f7f91479904 --- /dev/null +++ b/nemo/collections/audio/parts/utils/transforms.py @@ -0,0 +1,1105 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# NOTE: The code below originates from torchaudio repository, version 2.9. +# It can be found under: https://github.com/pytorch/audio/tree/release/2.9 +# The modifications applied are mostly cosmetic. +# The inclusion of this code in NeMo allows us to avoid +# a dependency with a problematic build process. +# This code is licensed under the BSD 2-Clause License, +# included verbatim from the torchaudio repository below: +# +# BSD 2-Clause License +# +# Copyright (c) 2017 Facebook Inc. (Soumith Chintala), +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math +import warnings +from typing import Callable, Optional, Union + +import torch +from torch import Tensor + +__all__ = ["Spectrogram", "MelSpectrogram", "MFCC", "Resample"] + + +class Spectrogram(torch.nn.Module): + r"""Create a spectrogram from a audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float or None, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + If None, then the complex spectrum is returned instead. (Default: ``2``) + normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are + ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to + ``"window"``. (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy (Default: ``True``) + return_complex (bool, optional): + Deprecated and not used. 
+ + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = torchaudio.transforms.Spectrogram(n_fft=800) + >>> spectrogram = transform(waveform) + + """ + + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: Optional[float] = 2.0, + normalized: Union[bool, str] = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, + ) -> None: + super().__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.power = power + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Dimension (..., freq, time), where freq is + ``n_fft // 2 + 1`` where ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + return spectrogram( + waveform, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class MelSpectrogram(torch.nn.Module): + r"""Create MelSpectrogram for a raw audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This is a composition of :py:func:`torchaudio.transforms.Spectrogram` + and :py:func:`torchaudio.transforms.MelScale`. + + Sources + * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe + * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html + * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``None``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. 
(Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided: Deprecated and unused. + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.MelSpectrogram(sample_rate) + >>> mel_specgram = transform(waveform) # (channel, n_mels, time) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"] + + def __init__( + self, + sample_rate: int = 16000, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + f_min: float = 0.0, + f_max: Optional[float] = None, + pad: int = 0, + n_mels: int = 128, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: Optional[bool] = None, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelSpectrogram, self).__init__() + + if onesided is not None: + warnings.warn( + "Argument 'onesided' has been deprecated and has no influence on the behavior of this module." + ) + + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + self.pad = pad + self.power = power + self.normalized = normalized + self.n_mels = n_mels # number of mel frequency bins + self.f_max = f_max + self.f_min = f_min + self.spectrogram = Spectrogram( + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length, + pad=self.pad, + window_fn=window_fn, + power=self.power, + normalized=self.normalized, + wkwargs=wkwargs, + center=center, + pad_mode=pad_mode, + onesided=True, + ) + self.mel_scale = MelScale( + self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + specgram = self.spectrogram(waveform) + mel_specgram = self.mel_scale(specgram) + return mel_specgram + + +class MFCC(torch.nn.Module): + r"""Create the Mel-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + By default, this calculates the MFCC on the DB-scaled Mel spectrogram. 
+ This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``"ortho"``) + log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``) + melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.MFCC( + >>> sample_rate=sample_rate, + >>> n_mfcc=13, + >>> melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False}, + >>> ) + >>> mfcc = transform(waveform) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"] + + def __init__( + self, + sample_rate: int = 16000, + n_mfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_mels: bool = False, + melkwargs: Optional[dict] = None, + ) -> None: + super(MFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + melkwargs = melkwargs or {} + self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs) + + if self.n_mfcc > self.MelSpectrogram.n_mels: + raise ValueError("Cannot select more MFCC coefficients than # mel bins") + dct_mat = create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_mels = log_mels + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: specgram_mel_db of size (..., ``n_mfcc``, time). + """ + mel_specgram = self.MelSpectrogram(waveform) + if self.log_mels: + log_offset = 1e-6 + mel_specgram = torch.log(mel_specgram + log_offset) + else: + mel_specgram = self.amplitude_to_DB(mel_specgram) + + # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_nfcc, time) + mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return mfcc + + +class Resample(torch.nn.Module): + r"""Resample a signal from one frequency to another. A resampling method can be given. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + If resampling on waveforms of higher precision than float32, there may be a small loss of precision + because the kernel is cached once as float32. If high precision resampling is important for your application, + the functional form will retain higher precision, but run slower because it does not cache the kernel. + Alternatively, you could rewrite a transform that caches a higher precision kernel. + + Args: + orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) + new_freq (int, optional): The desired frequency. 
(Default: ``16000``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``) + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + beta (float or None, optional): The shape parameter used for kaiser window. + dtype (torch.device, optional): + Determnines the precision that resampling kernel is pre-computed and cached. If not provided, + kernel is computed with ``torch.float64`` then cached as ``torch.float32``. + If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and + cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this + providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still + carried out on ``torch.float64``. + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.Resample(sample_rate, sample_rate/10) + >>> waveform = transform(waveform) + """ + + def __init__( + self, + orig_freq: int = 16000, + new_freq: int = 16000, + resampling_method: str = "sinc_interp_hann", + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + beta: Optional[float] = None, + *, + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__() + + self.orig_freq = orig_freq + self.new_freq = new_freq + self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) + self.resampling_method = resampling_method + self.lowpass_filter_width = lowpass_filter_width + self.rolloff = rolloff + self.beta = beta + + if self.orig_freq != self.new_freq: + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.new_freq, + self.gcd, + self.lowpass_filter_width, + self.rolloff, + self.resampling_method, + beta, + dtype=dtype, + ) + self.register_buffer("kernel", kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Output signal of dimension (..., time). + """ + if self.orig_freq == self.new_freq: + return waveform + return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) + + +class MelScale(torch.nn.Module): + r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``) + norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) + >>> spectrogram = spectrogram_transform(waveform) + >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1) + >>> melscale_spectrogram = melscale_transform(spectrogram) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + + __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"] + + def __init__( + self, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_stft: int = 201, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.f_min = f_min + self.norm = norm + self.mel_scale = mel_scale + + if f_min > self.f_max: + raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max)) + + fb = melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale) + self.register_buffer("fb", fb) + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + + # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time) + mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2) + + return mel_specgram + + +class AmplitudeToDB(torch.nn.Module): + r"""Turn a tensor from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This output depends on the maximum value in the input tensor, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + stype (str, optional): scale of input tensor (``"power"`` or ``"magnitude"``). The + power being the elementwise square of the magnitude. (Default: ``"power"``) + top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable + number is 80. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.AmplitudeToDB(stype="amplitude", top_db=80) + >>> waveform_db = transform(waveform) + """ + + __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"] + + def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None: + super(AmplitudeToDB, self).__init__() + self.stype = stype + if top_db is not None and top_db < 0: + raise ValueError("top_db must be positive value") + self.top_db = top_db + self.multiplier = 10.0 if stype == "power" else 20.0 + self.amin = 1e-10 + self.ref_value = 1.0 + self.db_multiplier = math.log10(max(self.amin, self.ref_value)) + + def forward(self, x: Tensor) -> Tensor: + r"""Numerically stable implementation from Librosa. + + https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html + + Args: + x (Tensor): Input tensor before being converted to decibel scale. + + Returns: + Tensor: Output tensor in decibel scale. 
+ """ + return amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db) + + +def resample( + waveform: Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, +) -> Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. + Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) + beta (float or None, optional): The shape parameter used for kaiser window. + + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + if orig_freq <= 0.0 or new_freq <= 0.0: + raise ValueError("Original frequency and desired frequecy should be positive") + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.device, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, + device: torch.device = "cpu", + dtype: Optional[torch.dtype] = None, +): + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + "To work around this, manually convert both frequencies to integer values " + "that maintain their resampling rate ratio before passing them into the function. " + "Example: To downsample a 44100 hz waveform by a factor of 8, use " + "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " + "For more information, please refer to https://github.com/pytorch/audio/issues/1487." + ) + + if resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + if lowpass_filter_width <= 0: + raise ValueError("Low pass filter width should be positive.") + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. 
+ # At first I thought I only needed this when downsampling, but when upsampling + # you will get edge artifacts without this, as the edge is equivalent to zero padding, + # which will add high freq artifacts. + base_freq *= rolloff + + # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) + # using the sinc interpolation formula: + # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) + # We can then sample the function x(t) with a different sample rate: + # y[j] = x(j / new_freq) + # or, + # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + + # We see here that y[j] is the convolution of x[i] with a specific filter, for which + # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. + # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. + # Indeed: + # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) + # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) + # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. + # This will explain the F.conv1d after, with a stride of orig_freq. + width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. + idx_dtype = dtype if dtype is not None else torch.float64 + + idx = torch.arange(-width, width + orig_freq, dtype=idx_dtype, device=device)[None, None] / orig_freq + + t = torch.arange(0, -new_freq, -1, dtype=dtype, device=device)[:, None, None] / new_freq + idx + t *= base_freq + t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) + + # we do not use built in torch windows here as we need to evaluate the window + # at specific positions, not over a regular grid. 
+ if resampling_method == "sinc_interp_hann": + window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # sinc_interp_kaiser + if beta is None: + beta = 14.769656459379492 + beta_tensor = torch.tensor(float(beta)) + window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) + + t *= math.pi + + scale = base_freq / orig_freq + kernels = torch.where(t == 0, torch.tensor(1.0).to(t), t.sin() / t) + kernels *= window * scale + + if dtype is None: + kernels = kernels.to(dtype=torch.float32) + + return kernels, width + + +def _apply_sinc_resample_kernel( + waveform: Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: Tensor, + width: int, +): + if not waveform.is_floating_point(): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + num_wavs, length = waveform.shape + waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) + resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) + target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long() + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def spectrogram( + waveform: Tensor, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: Optional[float], + normalized: Union[bool, str], + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, +) -> Tensor: + r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. + The spectrogram can be either magnitude-only or complex. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + power (float or None): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + If None, then the complex spectrum is returned instead. + normalized (bool or str): Whether to normalize by magnitude after stft. If input is str, choices are + ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to + ``"window"``. When normalized on ``"window"``, waveform is normalized upon the window's L2 energy. If + normalized on ``"frame_length"``, waveform is normalized by dividing by + :math:`(\text{frame\_length})^{0.5}`. + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. Default: ``True`` + return_complex (bool, optional): + Deprecated and not used. 
+ + Returns: + Tensor: Dimension `(..., freq, time)`, freq is + ``n_fft // 2 + 1`` and ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.functional.spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + if pad > 0: + # TODO add "with torch.no_grad():" back when JIT supports it + waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant") + + frame_length_norm, window_norm = _get_spec_norms(normalized) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=frame_length_norm, + onesided=onesided, + return_complex=True, + ) + + # unpack batch + spec_f = spec_f.reshape(shape[:-1] + spec_f.shape[-2:]) + + if window_norm: + spec_f /= window.pow(2.0).sum().sqrt() + if power is not None: + if power == 1.0: + return spec_f.abs() + return spec_f.abs().pow(power) + return spec_f + + +def _get_spec_norms(normalized: Union[str, bool]): + frame_length_norm, window_norm = False, False + if torch.jit.isinstance(normalized, str): + if normalized not in ["frame_length", "window"]: + raise ValueError("Invalid normalized parameter: {}".format(normalized)) + if normalized == "frame_length": + frame_length_norm = True + elif normalized == "window": + window_norm = True + elif torch.jit.isinstance(normalized, bool): + if normalized: + window_norm = True + else: + raise TypeError("Input type not supported") + return frame_length_norm, window_norm + + +def amplitude_to_DB( + x: Tensor, multiplier: float, amin: float, db_multiplier: float, top_db: Optional[float] = None +) -> Tensor: + r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The output of each tensor in a batch depends on the maximum value of that tensor, + and so may return different values for an audio clip split into snippets vs. a full clip. + + Args: + + x (Tensor): Input spectrogram(s) before being converted to decibel scale. + The expected shapes are ``(freq, time)``, ``(channel, freq, time)`` or + ``(..., batch, channel, freq, time)``. + + .. note:: + + When ``top_db`` is specified, cut-off values are computed for each audio + in the batch. Therefore if the input shape is 4D (or larger), different + cut-off values are used for audio data in the batch. + If the input shape is 2D or 3D, a single cutoff value is used. + + multiplier (float): Use 10. for power and 20. for amplitude + amin (float): Number to clamp ``x`` + db_multiplier (float): Log10(max(reference value and amin)) + top_db (float or None, optional): Minimum negative cut-off in decibels. A reasonable number + is 80. 
(Default: ``None``) + + Returns: + Tensor: Output tensor in decibel scale + """ + x_db = multiplier * torch.log10(torch.clamp(x, min=amin)) + x_db -= multiplier * db_multiplier + + if top_db is not None: + # Expand batch + shape = x_db.size() + packed_channels = shape[-3] if x_db.dim() > 2 else 1 + x_db = x_db.reshape(-1, packed_channels, shape[-2], shape[-1]) + + x_db = torch.max(x_db, (x_db.amax(dim=(-3, -2, -1)) - top_db).view(-1, 1, 1, 1)) + + # Repack batch + x_db = x_db.reshape(shape) + + return x_db + + +def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: + r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), + normalized depending on norm. + + .. devices:: CPU + + .. properties:: TorchScript + + Args: + n_mfcc (int): Number of mfc coefficients to retain + n_mels (int): Number of mel filterbanks + norm (str or None): Norm to use (either "ortho" or None) + + Returns: + Tensor: The transformation matrix, to be right-multiplied to + row-wise data of size (``n_mels``, ``n_mfcc``). + """ + + if norm is not None and norm != "ortho": + raise ValueError('norm must be either "ortho" or None') + + # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II + n = torch.arange(float(n_mels)) + k = torch.arange(float(n_mfcc)).unsqueeze(1) + dct = torch.cos(math.pi / float(n_mels) * (n + 0.5) * k) # size (n_mfcc, n_mels) + + if norm is None: + dct *= 2.0 + else: + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.t() + + +def melscale_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> Tensor: + r"""Create a frequency bin conversion matrix. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A @ melscale_fbanks(A.size(-1), ...)``. 
+ + """ + + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = _hz_to_mel(f_min, mel_scale=mel_scale) + m_max = _hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = torch.linspace(m_min, m_max, n_mels + 2) + f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= enorm.unsqueeze(0) + + if (fb.max(dim=0).values == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return fb + + +def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): Frequencies in Hz + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + +def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor: + """Convert mel bin numbers to frequencies. + + Args: + mels (Tensor): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + freqs (Tensor): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +def _create_triangular_filterbank( + all_freqs: Tensor, + f_pts: Tensor, +) -> Tensor: + """Create a triangular filter bank. + + Args: + all_freqs (Tensor): STFT freq points of size (`n_freqs`). + f_pts (Tensor): Filter mid points of size (`n_filter`). + + Returns: + fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`). 
+ """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = torch.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = torch.max(zero, torch.min(down_slopes, up_slopes)) + + return fb diff --git a/nemo/collections/speechlm2/models/duplex_s2s_model.py b/nemo/collections/speechlm2/models/duplex_s2s_model.py index 2de158d88be9..f3c865f63a45 100644 --- a/nemo/collections/speechlm2/models/duplex_s2s_model.py +++ b/nemo/collections/speechlm2/models/duplex_s2s_model.py @@ -30,7 +30,7 @@ ) from transformers import DynamicCache -from nemo.collections.audio.parts.utils.resampling import resample +from nemo.collections.audio.parts.utils.transforms import resample from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.speechlm2.data.utils import get_pad_id from nemo.collections.speechlm2.parts.hf_hub import HFHubMixin diff --git a/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py b/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py index 3605e886b3e4..7724b4d0e01d 100644 --- a/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py +++ b/nemo/collections/speechlm2/models/duplex_s2s_speech_decoder_model.py @@ -29,7 +29,7 @@ ) from transformers import DynamicCache -from nemo.collections.audio.parts.utils.resampling import resample +from nemo.collections.audio.parts.utils.transforms import resample from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.speechlm2.data.utils import get_pad_id from nemo.collections.speechlm2.models.duplex_s2s_model import replace_control_speech_codes, tokens_to_str diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index f37818e43d89..2cdd5f0f8c9c 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -13,6 +13,7 @@ # limitations under the License. 
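
# --- Editorial aside, not part of the patch: how the functional helpers above compose. ---
# The new module bundles the pieces previously taken from torchaudio (resample, spectrogram,
# amplitude_to_DB, create_dct, melscale_fbanks, plus the Resample and MelSpectrogram modules used by
# the files changed below). A hedged sketch of a log-mel/MFCC pipeline built only from the functions
# defined above, assuming the module path used by the import changes in this patch.
import math

import torch

from nemo.collections.audio.parts.utils.transforms import (
    amplitude_to_DB,
    create_dct,
    melscale_fbanks,
    spectrogram,
)

sample_rate, n_fft, hop_length, n_mels, n_mfcc = 16_000, 400, 160, 64, 13
waveform = torch.randn(1, sample_rate)  # one second of batched audio

# Power spectrogram: (..., n_fft // 2 + 1, time)
spec = spectrogram(
    waveform,
    pad=0,
    window=torch.hann_window(n_fft),
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=n_fft,
    power=2.0,
    normalized=False,
)

# Mel filterbank of shape (n_freqs, n_mels), applied as described in the melscale_fbanks docstring
fb = melscale_fbanks(
    n_freqs=n_fft // 2 + 1, f_min=0.0, f_max=sample_rate / 2, n_mels=n_mels, sample_rate=sample_rate
)
mel = torch.matmul(spec.transpose(-1, -2), fb).transpose(-1, -2)  # (..., n_mels, time)

# Decibel scale: multiplier=10 for power spectrograms; db_multiplier = log10(reference) = 0 for reference 1.0
log_mel = amplitude_to_DB(mel, multiplier=10.0, amin=1e-10, db_multiplier=0.0, top_db=80.0)

# MFCCs via the DCT-II matrix from create_dct: (..., n_mfcc, time)
dct_mat = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, norm="ortho")
mfcc = torch.matmul(log_mel.transpose(-1, -2), dct_mat).transpose(-1, -2)

assert mfcc.shape[:-1] == (1, n_mfcc)  # frame count depends on hop_length; here 1 + 16000 // 160 = 101
# --- End of editorial aside. ---
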
import itertools +from contextlib import nullcontext from math import ceil from pathlib import Path from typing import List, Tuple @@ -24,6 +25,7 @@ from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict +from nemo.collections.audio.parts.utils.transforms import Resample, resample from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.losses.audio_codec_loss import ( FeatureMatchingLoss, @@ -51,13 +53,6 @@ from nemo.core.optim.lr_scheduler import compute_max_steps, prepare_lr_scheduler from nemo.utils import logging, model_utils -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - class AudioCodecModel(ModelPT): def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -192,7 +187,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() - print("Speaker encoder loaded and frozen !!") + logging.info("Speaker encoder loaded and frozen !!") + self.speaker_encoder_resampler = Resample( + orig_freq=self.sample_rate, new_freq=self.speaker_encoder.audio_config["sample_rate"] + ) # Disabled for now as it is not used in final model self.use_asr_consitency_loss = False @@ -254,24 +252,9 @@ def load_state_dict(self, state_dict, strict=True): super().load_state_dict(state_dict, strict=False) def get_speaker_embedding(self, audio, requires_grad=False): - if not requires_grad: - with torch.no_grad(): - if HAVE_TORCHAUDIO: - audio_resampled = torchaudio.functional.resample( - audio, self.sample_rate, self.speaker_encoder.audio_config["sample_rate"] - ) - else: - logging.error('Could not import torchaudio!') - raise ModuleNotFoundError("torchaudio is not installed but is necessary to audio resample !!") - g = self.speaker_encoder(audio_resampled, l2_norm=True).unsqueeze(-1) - else: - if HAVE_TORCHAUDIO: - audio_resampled = torchaudio.functional.resample( - audio, self.sample_rate, self.speaker_encoder.audio_config["sample_rate"] - ) - else: - logging.error('Could not import torchaudio!') - raise ModuleNotFoundError("torchaudio is not installed but is necessary to audio resample !!") + grad_context = nullcontext() if requires_grad else torch.no_grad() + with grad_context: + audio_resampled = self.speaker_encoder_resampler(audio) g = self.speaker_encoder(audio_resampled, l2_norm=True).unsqueeze(-1) return g @@ -506,10 +489,7 @@ def pad_audio(self, audio, audio_len, samples_per_frame): def preprocess_audio(self, audio, audio_len, sample_rate): if sample_rate and sample_rate != self.sample_rate: - if not HAVE_TORCHAUDIO: - raise ModuleNotFoundError("Must install torchaudio for resampling.") - - audio = torchaudio.functional.resample(waveform=audio, orig_freq=sample_rate, new_freq=self.sample_rate) + audio = resample(waveform=audio, orig_freq=sample_rate, new_freq=self.sample_rate) audio_len_scaled = audio_len.long() * self.sample_rate new_audio_len = audio_len_scaled / sample_rate # To avoid rounding issues at lower precisions, do not call torch.ceil when the length is divisible by the sample rate diff --git a/nemo/collections/tts/modules/audio_codec_modules.py b/nemo/collections/tts/modules/audio_codec_modules.py index 8ce522d798ec..3a0506f06d2c 100755 --- a/nemo/collections/tts/modules/audio_codec_modules.py +++ b/nemo/collections/tts/modules/audio_codec_modules.py @@ -25,6 +25,7 @@ from transformers import AutoModel from nemo.collections.asr.modules import 
AudioToMelSpectrogramPreprocessor +from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, Resample from nemo.collections.common.parts.utils import ClampActivation, HalfSnake, Snake, mask_sequence_tensor from nemo.core.classes.common import typecheck from nemo.core.classes.module import NeuralModule @@ -39,13 +40,6 @@ from nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - try: import fsspec @@ -121,11 +115,7 @@ def __init__( ): super().__init__() - if HAVE_TORCHAUDIO: - self.resample = torchaudio.transforms.Resample(input_sr, slm_sr) - else: - self.resample = None - + self.resample = Resample(orig_freq=input_sr, new_freq=slm_sr) self.slm_model = SSLModel(slm_model_name) # Freeze slm model @@ -353,11 +343,7 @@ def __init__( self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec and HAVE_TORCHAUDIO: - self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) - else: - self.torch_spec = None + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) if self.use_torch_spec else None outmap_size = int(self.input_dim / 8) @@ -460,7 +446,7 @@ def forward(self, x, l2_norm=False): def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( + MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], diff --git a/scripts/installers/install_torchaudio_latest.sh b/scripts/installers/install_torchaudio_latest.sh deleted file mode 100755 index bdad771fe267..000000000000 --- a/scripts/installers/install_torchaudio_latest.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Torch and torchaudio versions must match. Othervise, there will be no CUDA support. -# See https://github.com/pytorch/audio/blob/f0bc00c980012badea8db011f84a0e9ef33ba6c1/README.md?plain=1#L66 - -DEPENDENCIES_INSTALL_CMD="apt update && apt install -y ffmpeg sox libavdevice-dev" - -read -r -d '' INFO_MESSAGE << EOM -INFO: This script is supposed to be used when building a docker container using Dockerfile in NeMo. -Use the script only for compiling torchaudio from scratch with a Non-Standard PyTorch version (e.g., 2.1.0a0+32f93b1) -For the release PyTorch version (e.g., 2.1.0), use 'pip install torchaudio' instead. -If running stand-alone, install dependencies first: '${DEPENDENCIES_INSTALL_CMD}' -EOM - -echo "$INFO_MESSAGE" - -for lib in libavdevice sox; do - if ! grep -q ${lib} <<< "$(ldconfig -p)"; then - echo "ERROR: ${lib} not found. Install dependencies before running the script: '${DEPENDENCIES_INSTALL_CMD}'" - exit 1 - fi -done - -if ! 
command -v ffmpeg &> /dev/null; then - echo "ERROR: ffmpeg not found. Install dependencies before running the script: '${DEPENDENCIES_INSTALL_CMD}'" - exit 1 -fi - -TORCHAUDIO_REPO=https://github.com/pytorch/audio -# expected LATEST_RELEASE=release/*.** -LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ - ls-remote --exit-code --refs --sort='version:refname' --heads ${TORCHAUDIO_REPO} 'release/*.*' \ - | tail --lines=1 \ - | cut -d '/' -f 3,4) -TORCHAUDIO_LATEST_MAJOR_VERSION=$(python3 -c "major_version = (\"${LATEST_RELEASE}\".split('/')[-1]).split('.')[0]; print(major_version)") -TORCHAUDIO_LATEST_MINOR_VERSION=$(python3 -c "minor_version = \"${LATEST_RELEASE}\".rsplit('.')[-1]; print(minor_version)") - -# avoid checking PYTORCH_VERSION variable, not available everywhere -TORCH_FULL_VERSION=$(python3 -c "import torch; print(torch.__version__)") -TORCH_MAIN_VERSION=$(python3 -c "import torch, re; print(re.search(r'(\d+\.?)+', torch.__version__).group(0))") -TORCH_MAJOR_VERSION=$(python3 -c "major_version = \"${TORCH_MAIN_VERSION}\".split('.')[0]; print(major_version)") -TORCH_MINOR_VERSION=$(python3 -c "minor_version = \"${TORCH_MAIN_VERSION}\".split('.')[1]; print(minor_version)") -TORCH_FIX_VERSION=$(python3 -c "minor_version = \"${TORCH_MAIN_VERSION}\".split('.')[2]; print(minor_version)") - - -echo "Latest torchaudio release: ${TORCHAUDIO_LATEST_MAJOR_VERSION}.${TORCHAUDIO_LATEST_MINOR_VERSION}" -echo "Pytorch version: ${TORCH_MAIN_VERSION:0:6}" - -if [[ $TORCH_MAJOR_VERSION -eq 1 ]]; then - if [[ $TORCH_MINOR_VERSION -le 13 ]]; then - INSTALL_BRANCH="release/0.${TORCH_MINOR_VERSION}" - else - # fix for PyTorch 1.14 (no official release) - INSTALL_BRANCH="release/2.0" - fi - TORCHAUDIO_MAJOR_VERSION=0 -else # version 2 expected - TORCHAUDIO_MAJOR_VERSION=${TORCH_MAJOR_VERSION} - INSTALL_BRANCH="release/${TORCH_MAJOR_VERSION}.${TORCH_MINOR_VERSION}" -fi - - -# check if install branch exists -if [[ $(git ls-remote --heads ${TORCHAUDIO_REPO} ${INSTALL_BRANCH} | wc -l) -eq 0 ]] -then - echo "Branch ${INSTALL_BRANCH} does not exist in torchaudio repo. Using latest release." - INSTALL_BRANCH=${LATEST_RELEASE} -fi - -# expected TORCHAUDIO_BUILD_VERSION=*.**.* -TORCHAUDIO_BUILD_VERSION="${TORCHAUDIO_MAJOR_VERSION}.${TORCH_MINOR_VERSION}.${TORCH_FIX_VERSION}" - -echo "Torchaudio build version: ${TORCHAUDIO_BUILD_VERSION}" -echo "Installing torchaudio from branch: ${INSTALL_BRANCH}" - -# we need parameterized to run torchaudio tests -# suppose that we do not have parameterized installed yet -pip install parameterized - -# Build torchaudio and run MFCC test -# NB: setting PYTORCH_VERSION is a workaround for the case where PYTORCH_VERSION is set, but contains incorrect value -# e.g., in container nvcr.io/nvidia/pytorch:24.03-py3 -git clone --depth 1 --branch ${INSTALL_BRANCH} https://github.com/pytorch/audio.git && \ -cd audio && \ -git submodule update --init --recursive && \ -PYTORCH_VERSION=${TORCH_FULL_VERSION} USE_FFMPEG=1 BUILD_SOX=1 BUILD_VERSION=${TORCHAUDIO_BUILD_VERSION} python setup.py install && \ -cd .. && \ -pytest -rs audio/test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py -k 'test_MFCC' || \ -{ echo "ERROR: Failed to install torchaudio!"; exit 1; }; -# RNNT loss is built with CUDA, so checking it will suffice -# This test will be skipped if CUDA is not available (e.g. 
when building from docker) -pytest -rs audio/test/torchaudio_unittest/functional/torchscript_consistency_cuda_test.py -k 'test_rnnt_loss' || \ -echo "WARNING: Failed to install torchaudio with CUDA support!"; -rm -rf audio && \ -echo "Torchaudio installed successfully!" diff --git a/tests/collections/audio/test_audio_losses.py b/tests/collections/audio/test_audio_losses.py index 61875751796d..33d25bd8f182 100644 --- a/tests/collections/audio/test_audio_losses.py +++ b/tests/collections/audio/test_audio_losses.py @@ -29,16 +29,6 @@ convolution_invariant_target, scale_invariant_target, ) - -try: - import importlib - - importlib.import_module('torchaudio') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - from nemo.collections.audio.losses.maxine import CombinedLoss from nemo.collections.audio.parts.utils.audio import ( calculate_sdr_numpy, @@ -1089,7 +1079,6 @@ def test_mae_invalid_ndim(self): MAELoss(ndim=5) @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") def test_maxine_combined_loss(self, test_data_dir): INPUT_LOCATION = os.path.join(test_data_dir, 'audio', 'maxine', 'input.bin') ATOL = 1e-2 diff --git a/tests/collections/audio/test_audio_maxine_models.py b/tests/collections/audio/test_audio_maxine_models.py index 9c54a06ef0a1..5ae6793edf89 100644 --- a/tests/collections/audio/test_audio_maxine_models.py +++ b/tests/collections/audio/test_audio_maxine_models.py @@ -16,15 +16,6 @@ import torch from omegaconf import DictConfig -try: - import importlib - - importlib.import_module('torchaudio') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - from nemo.collections.audio.models.maxine import BNR2 @@ -81,7 +72,6 @@ class TestBNR2Model: """Test BNR 2 model.""" @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") def test_constructor(self, maxine_model_fixture): """Test that the model can be constructed from a config dict.""" model = maxine_model_fixture.train() @@ -90,7 +80,6 @@ def test_constructor(self, maxine_model_fixture): assert isinstance(instance2, BNR2) @pytest.mark.unit - @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") @pytest.mark.parametrize( "batch_size, sample_len", [ diff --git a/tests/collections/audio/test_audio_metrics.py b/tests/collections/audio/test_audio_metrics.py index 578b67fc2479..0e221c37e7a5 100644 --- a/tests/collections/audio/test_audio_metrics.py +++ b/tests/collections/audio/test_audio_metrics.py @@ -17,6 +17,7 @@ from nemo.collections.audio.metrics.audio import AudioMetricWrapper from nemo.collections.audio.metrics.squim import SquimMOSMetric, SquimObjectiveMetric +from nemo.collections.audio.parts.utils.transforms import Resample try: import torchaudio @@ -165,7 +166,7 @@ def test_squim_mos(self, fs: int): squim_mos_metric = SquimMOSMetric(fs=fs) # Helper function - resampler = torchaudio.transforms.Resample( + resampler = Resample( orig_freq=fs, new_freq=16000, lowpass_filter_width=64, @@ -222,7 +223,7 @@ def test_squim_objective(self, metric: str, fs: int): squim_objective_metric = SquimObjectiveMetric(fs=fs, metric=metric) # Helper function - resampler = torchaudio.transforms.Resample( + resampler = Resample( orig_freq=fs, new_freq=16000, lowpass_filter_width=64, diff --git a/tests/collections/audio/test_audio_models_flow_matching.py b/tests/collections/audio/test_audio_models_flow_matching.py index 65ddd564e478..b24424c7570a 
100644 --- a/tests/collections/audio/test_audio_models_flow_matching.py +++ b/tests/collections/audio/test_audio_models_flow_matching.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import FlowMatchingAudioToAudioModel +from nemo.collections.audio.models.enhancement import FlowMatchingAudioToAudioModel def convert_to_dictconfig(d): @@ -79,7 +79,7 @@ def flow_matching_base_config(request): 'time_max': flow['time_max'], } - loss = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} + loss = {'_target_': 'nemo.collections.audio.losses.audio.MSELoss', 'ndim': 4} estimator = { '_target_': 'nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet', diff --git a/tests/collections/audio/test_audio_models_mask.py b/tests/collections/audio/test_audio_models_mask.py index f847fcbaf313..a6d766818f98 100644 --- a/tests/collections/audio/test_audio_models_mask.py +++ b/tests/collections/audio/test_audio_models_mask.py @@ -25,7 +25,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import EncMaskDecAudioToAudioModel +from nemo.collections.audio.models.enhancement import EncMaskDecAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -111,7 +111,7 @@ def mask_model_rnn_params(): } loss = { - '_target_': 'nemo.collections.audio.losses.SDRLoss', + '_target_': 'nemo.collections.audio.losses.audio.SDRLoss', 'scale_invariant': True, } @@ -212,7 +212,7 @@ def mask_model_flexarray(): } loss = { - '_target_': 'nemo.collections.audio.losses.SDRLoss', + '_target_': 'nemo.collections.audio.losses.audio.SDRLoss', 'scale_invariant': True, } diff --git a/tests/collections/audio/test_audio_models_predictive.py b/tests/collections/audio/test_audio_models_predictive.py index b02f7e810ed1..55a688dbdccc 100644 --- a/tests/collections/audio/test_audio_models_predictive.py +++ b/tests/collections/audio/test_audio_models_predictive.py @@ -23,7 +23,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import PredictiveAudioToAudioModel +from nemo.collections.audio.models.enhancement import PredictiveAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -111,7 +111,7 @@ def predictive_model_ncsn(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -183,7 +183,7 @@ def predictive_model_conformer(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -255,7 +255,7 @@ def predictive_model_streaming_conformer(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -318,7 +318,7 @@ def predictive_model_transformer_unet_params_base(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -384,7 +384,7 @@ def predictive_model_conformer_unet(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 
'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( @@ -456,7 +456,7 @@ def predictive_model_streaming_conformer_unet(): } loss = { - '_target_': 'nemo.collections.audio.losses.MSELoss', # computed in the time domain + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', # computed in the time domain } model_config = DictConfig( diff --git a/tests/collections/audio/test_audio_models_schroedinger_bridge.py b/tests/collections/audio/test_audio_models_schroedinger_bridge.py index 00018764f927..6d4a228092b3 100644 --- a/tests/collections/audio/test_audio_models_schroedinger_bridge.py +++ b/tests/collections/audio/test_audio_models_schroedinger_bridge.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import SchroedingerBridgeAudioToAudioModel +from nemo.collections.audio.models.enhancement import SchroedingerBridgeAudioToAudioModel @pytest.fixture(params=["nemo_manifest", "lhotse_cuts"]) @@ -112,9 +112,12 @@ def schroedinger_bridge_model_ncsn_params(): 'pad_dimension_to': 0, # no padding in the frequency dimension } - loss_encoded = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} # computed in the time domain + loss_encoded = { + '_target_': 'nemo.collections.audio.losses.audio.MSELoss', + 'ndim': 4, + } # computed in the time domain - loss_time = {'_target_': 'nemo.collections.audio.losses.MAELoss'} + loss_time = {'_target_': 'nemo.collections.audio.losses.audio.MAELoss'} noise_schedule = { '_target_': 'nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE', diff --git a/tests/collections/audio/test_audio_models_score_based.py b/tests/collections/audio/test_audio_models_score_based.py index 7028c3d285f7..03942ebdfb2f 100644 --- a/tests/collections/audio/test_audio_models_score_based.py +++ b/tests/collections/audio/test_audio_models_score_based.py @@ -24,7 +24,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.audio.models import ScoreBasedGenerativeAudioToAudioModel +from nemo.collections.audio.models.enhancement import ScoreBasedGenerativeAudioToAudioModel def convert_to_dictconfig(d): @@ -87,7 +87,7 @@ def score_based_base_config(): 'snr': 0.5, } - loss = {'_target_': 'nemo.collections.audio.losses.MSELoss', 'ndim': 4} + loss = {'_target_': 'nemo.collections.audio.losses.audio.MSELoss', 'ndim': 4} trainer = { 'max_epochs': -1, diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 10e0e392da6a..c221b43640ef 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -45,9 +45,6 @@ "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", "## Grab the config we'll use in this example\n", "!mkdir configs" ] @@ -795,7 +792,7 @@ }, "outputs": [], "source": [ - "!ls -d -- *.nemo " + "!ls -d -- *.nemo" ] }, { diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 7d80c8e96de5..9d0967474cff 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -28,9 +28,6 @@ "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", "## Grab the 
config we'll use in this example\n", "!mkdir configs" ] @@ -860,7 +857,7 @@ "source": [ "# Custom element types are now imported from helper_files.gpt_components:\n", "# - AttentionType(EncodedRepresentation): Basic Attention Element Type\n", - "# - SelfAttentionType(AttentionType): Self Attention Element Type \n", + "# - SelfAttentionType(AttentionType): Self Attention Element Type\n", "# - CausalSelfAttentionType(SelfAttentionType): Causal Self Attention Element Type\n", "print(\"Custom element types imported successfully!\")" ] @@ -1192,7 +1189,7 @@ "\n", "# Example instantiation (with dummy parameters for demonstration)\n", "dummy_decoder = GPTDecoder(n_embd=32, vocab_size=100)\n", - "print(f\"Input types: {dummy_decoder.input_types}\") \n", + "print(f\"Input types: {dummy_decoder.input_types}\")\n", "print(f\"Output types: {dummy_decoder.output_types}\")\n" ] }, @@ -1305,7 +1302,7 @@ "# block_size: int # length of the model's context window in time\n", "# n_layer: int # depth of the model; number of Transformer blocks in sequence\n", "# n_embd: int # the \"width\" of the model, number of channels in each Transformer\n", - "# n_head: int # number of heads in each multi-head attention inside each Transformer block \n", + "# n_head: int # number of heads in each multi-head attention inside each Transformer block\n", "\n", "# model definition args (optional)\n", "# ================================\n", @@ -1681,10 +1678,10 @@ "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self._train_dl = None\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", " self._validation_dl = None\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = None" ] @@ -1757,7 +1754,7 @@ "\n", " def test_step(self, *args, **kwargs):\n", " return self.step_('test', *args, **kwargs)\n", - " \n", + "\n", " # This is useful for multiple validation data loader setup\n", " def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):\n", " val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()\n", @@ -2159,14 +2156,14 @@ " pin_memory=cfg.pin_memory if 'pin_memory' in cfg else False,\n", " num_workers=cfg.num_workers if 'num_workers' in cfg else 0\n", " )\n", - " \n", + "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self.vocab = None\n", " self._train_dl = self._setup_data_loader(train_data_config)\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", " self._validation_dl = self._setup_data_loader(val_data_config)\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" ] @@ -2414,7 +2411,7 @@ "outputs": [], "source": [ "class NeMoGPTv2(NeMoGPT):\n", - " \n", + "\n", " def setup_training_data(self, train_data_config: OmegaConf):\n", " self.vocab = None\n", " self._train_dl = self._setup_data_loader(train_data_config)\n", @@ -2423,25 +2420,25 @@ " with open('vocab.txt', 'w') as f:\n", " for token in self.vocab:\n", " f.write(f\"{token}\")\n", - " \n", + "\n", " # This is going to register the file into .nemo!\n", " # When you later use .save_to(), it will copy this file into the tar file.\n", " self.register_artifact('vocab_file', 'vocab.txt')\n", - " \n", + "\n", " def setup_validation_data(self, val_data_config: OmegaConf):\n", - " # This is going to try to find the same file, and if it fails, \n", + " # This is going to try to find the same 
file, and if it fails,\n", " # it will use the copy in .nemo\n", " vocab_file = self.register_artifact('vocab_file', 'vocab.txt')\n", - " \n", + "\n", " with open(vocab_file, 'r') as f:\n", " vocab = []\n", " vocab = f.read().split('')[:-1] # the -1 here is for the dangling token in the file\n", " self.vocab = vocab\n", "\n", " self._validation_dl = self._setup_data_loader(val_data_config)\n", - " \n", + "\n", " def setup_test_data(self, test_data_config: OmegaConf):\n", - " # This is going to try to find the same file, and if it fails, \n", + " # This is going to try to find the same file, and if it fails,\n", " # it will use the copy in .nemo\n", " vocab_file = self.register_artifact('vocab_file', 'vocab.txt')\n", "\n", diff --git a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb index 1f1cdbc2a48d..db0159977e84 100644 --- a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb @@ -7,9 +7,9 @@ "outputs": [], "source": [ "\"\"\"\n", - "Please run notebook locally (if you have all the dependencies and a GPU). \n", + "Please run notebook locally (if you have all the dependencies and a GPU).\n", "Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.\n", - " \n", + "\n", "Instructions for setting up Colab are as follows:\n", "1. Open a new Python 3 notebook.\n", "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", @@ -30,10 +30,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.13.0 -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]" ] }, { @@ -65,13 +62,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook requires the `torchaudio` library to be installed for MarbleNet. 
Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -315,17 +305,17 @@ " super().__init__()\n", " self._sample_rate = sample_rate\n", " self.output = True\n", - " \n", + "\n", " def __iter__(self):\n", " return self\n", - " \n", + "\n", " def __next__(self):\n", " if not self.output:\n", " raise StopIteration\n", " self.output = False\n", " return torch.as_tensor(self.signal, dtype=torch.float32), \\\n", " torch.as_tensor(self.signal_shape, dtype=torch.int64)\n", - " \n", + "\n", " def set_signal(self, signal):\n", " self.signal = signal.astype(np.float32)/32768.\n", " self.signal_shape = self.signal.size\n", @@ -373,10 +363,10 @@ "# contiguous signal's frames\n", "# To simplify the flow, we use single threshold to binarize predictions.\n", "class FrameVAD:\n", - " \n", + "\n", " def __init__(self, model_definition,\n", " threshold=0.5,\n", - " frame_len=2, frame_overlap=2.5, \n", + " frame_len=2, frame_overlap=2.5,\n", " offset=10):\n", " '''\n", " Args:\n", @@ -387,7 +377,7 @@ " '''\n", " self.vocab = list(model_definition['labels'])\n", " self.vocab.append('_')\n", - " \n", + "\n", " self.sr = model_definition['sample_rate']\n", " self.threshold = threshold\n", " self.frame_len = frame_len\n", @@ -401,7 +391,7 @@ " dtype=np.float32)\n", " self.offset = offset\n", " self.reset()\n", - " \n", + "\n", " def _decode(self, frame, offset=0):\n", " assert len(frame)==self.n_frame_len\n", " self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]\n", @@ -412,9 +402,9 @@ " logits,\n", " self.vocab\n", " )\n", - " return decoded \n", - " \n", - " \n", + " return decoded\n", + "\n", + "\n", " @torch.no_grad()\n", " def transcribe(self, frame=None):\n", " if frame is None:\n", @@ -423,7 +413,7 @@ " frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')\n", " unmerged = self._decode(frame, self.offset)\n", " return unmerged\n", - " \n", + "\n", " def reset(self):\n", " '''\n", " Reset frame_history and decoder's state\n", @@ -471,14 +461,14 @@ "import wave\n", "\n", "def offline_inference(wave_file, STEP = 0.025, WINDOW_SIZE = 0.5, threshold=0.5):\n", - " \n", - " FRAME_LEN = STEP # infer every STEP seconds \n", + "\n", + " FRAME_LEN = STEP # infer every STEP seconds\n", " CHANNELS = 1 # number of audio channels (expect mono signal)\n", " RATE = 16000 # sample rate, Hz\n", - " \n", - " \n", + "\n", + "\n", " CHUNK_SIZE = int(FRAME_LEN*RATE)\n", - " \n", + "\n", " vad = FrameVAD(model_definition = {\n", " 'sample_rate': SAMPLE_RATE,\n", " 'AudioToMFCCPreprocessor': cfg.preprocessor,\n", @@ -509,7 +499,7 @@ " preds.append(result[0])\n", " proba_b.append(result[2])\n", " proba_s.append(result[3])\n", - " \n", + "\n", " if len(result):\n", " print(result,end='\\n')\n", " empty_counter = 3\n", @@ -517,10 +507,10 @@ " empty_counter -= 1\n", " if empty_counter == 0:\n", " print(' ',end='')\n", - " \n", + "\n", " p.terminate()\n", " vad.reset()\n", - " \n", + "\n", " return preds, proba_b, proba_s" ] }, @@ -542,7 +532,7 @@ "source": [ "demo_wave = 'VAD_demo.wav'\n", "if not os.path.exists(demo_wave):\n", - " !wget \"https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav\" " + " !wget \"https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav\"" ] }, { @@ -612,12 +602,12 @@ "\n", 
"num = len(results)\n", "for i in range(num):\n", - " len_pred = len(results[i][2]) \n", + " len_pred = len(results[i][2])\n", " FRAME_LEN = results[i][0]\n", " ax1 = plt.subplot(num+1,1,i+1)\n", "\n", " ax1.plot(np.arange(audio.size) / sample_rate, audio, 'b')\n", - " ax1.set_xlim([-0.01, int(dur)+1]) \n", + " ax1.set_xlim([-0.01, int(dur)+1])\n", " ax1.tick_params(axis='y', labelcolor= 'b')\n", " ax1.set_ylabel('Signal')\n", " ax1.set_ylim([-1, 1])\n", @@ -633,8 +623,8 @@ "\n", " ax2.set_title(f'step {results[i][0]}s, buffer size {results[i][1]}s')\n", " ax2.set_ylabel('Preds and Probas')\n", - " \n", - " \n", + "\n", + "\n", "ax = plt.subplot(num+1,1,num+1)\n", "S = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=64, fmax=8000)\n", "S_dB = librosa.power_to_db(S, ref=np.max)\n", @@ -664,9 +654,9 @@ "metadata": {}, "outputs": [], "source": [ - "STEP = 0.01 \n", + "STEP = 0.01\n", "WINDOW_SIZE = 0.31\n", - "CHANNELS = 1 \n", + "CHANNELS = 1\n", "RATE = 16000\n", "FRAME_LEN = STEP\n", "THRESHOLD = 0.5\n", @@ -679,7 +669,7 @@ " 'labels': cfg.labels\n", " },\n", " threshold=THRESHOLD,\n", - " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, \n", + " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,\n", " offset=0)\n" ] }, @@ -732,19 +722,19 @@ " print('Listening...')\n", "\n", " stream.start_stream()\n", - " \n", + "\n", " # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !\n", " try:\n", " while stream.is_active():\n", " time.sleep(0.1)\n", - " finally: \n", + " finally:\n", " stream.stop_stream()\n", " stream.close()\n", " p.terminate()\n", "\n", " print()\n", " print(\"PyAudio stopped\")\n", - " \n", + "\n", "else:\n", " print(\"ERROR: No audio input device found, please check if the jupyter notebook has access to your computer's microphone.\")" ] diff --git a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb index 858f162b1834..c7dec1ce8811 100644 --- a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb @@ -9,16 +9,18 @@ "outputs": [], "source": [ "\"\"\"\n", - "Please run notebook locally (if you have all the dependencies and a GPU). \n", + "Please run notebook locally (if you have all the dependencies and a GPU).\n", "Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.\n", - " \n", + "\n", "Instructions for setting up Colab are as follows:\n", "1. Open a new Python 3 notebook.\n", "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. 
Set up microphone for Colab\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -30,10 +32,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.13.0 -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]" ] }, { @@ -55,13 +54,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -243,17 +235,17 @@ " super().__init__()\n", " self._sample_rate = sample_rate\n", " self.output = True\n", - " \n", + "\n", " def __iter__(self):\n", " return self\n", - " \n", + "\n", " def __next__(self):\n", " if not self.output:\n", " raise StopIteration\n", " self.output = False\n", " return torch.as_tensor(self.signal, dtype=torch.float32), \\\n", " torch.as_tensor(self.signal_shape, dtype=torch.int64)\n", - " \n", + "\n", " def set_signal(self, signal):\n", " self.signal = signal.astype(np.float32)/32768.\n", " self.signal_shape = self.signal.size\n", @@ -313,9 +305,9 @@ "# 2) call transcribe(frame) to do ASR on\n", "# contiguous signal's frames\n", "class FrameASR:\n", - " \n", + "\n", " def __init__(self, model_definition,\n", - " frame_len=2, frame_overlap=2.5, \n", + " frame_len=2, frame_overlap=2.5,\n", " offset=0):\n", " '''\n", " Args:\n", @@ -325,7 +317,7 @@ " '''\n", " self.task = model_definition['task']\n", " self.vocab = list(model_definition['labels'])\n", - " \n", + "\n", " self.sr = model_definition['sample_rate']\n", " self.frame_len = frame_len\n", " self.n_frame_len = int(frame_len * self.sr)\n", @@ -338,7 +330,7 @@ " dtype=np.float32)\n", " self.offset = offset\n", " self.reset()\n", - " \n", + "\n", " @torch.no_grad()\n", " def _decode(self, frame, offset=0):\n", " assert len(frame)==self.n_frame_len\n", @@ -348,16 +340,16 @@ " if self.task == 'mbn':\n", " logits = infer_signal(mbn_model, self.buffer).to('cpu').numpy()[0]\n", " decoded = self._mbn_greedy_decoder(logits, self.vocab)\n", - " \n", + "\n", " elif self.task == 'vad':\n", " logits = infer_signal(vad_model, self.buffer).to('cpu').numpy()[0]\n", " decoded = self._vad_greedy_decoder(logits, self.vocab)\n", - " \n", + "\n", " else:\n", " raise(\"Task should either be of mbn or vad!\")\n", - " \n", + "\n", " return decoded[:len(decoded)-offset]\n", - " \n", + "\n", " def transcribe(self, frame=None,merge=False):\n", " if frame is None:\n", " frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)\n", @@ -365,8 +357,8 @@ " frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')\n", " unmerged = self._decode(frame, self.offset)\n", " return 
unmerged\n", - " \n", - " \n", + "\n", + "\n", " def reset(self):\n", " '''\n", " Reset frame_history and decoder's state\n", @@ -374,17 +366,17 @@ " self.buffer=np.zeros(shape=self.buffer.shape, dtype=np.float32)\n", " self.mbn_s = []\n", " self.vad_s = []\n", - " \n", + "\n", " @staticmethod\n", " def _mbn_greedy_decoder(logits, vocab):\n", " mbn_s = []\n", " if logits.shape[0]:\n", " class_idx = np.argmax(logits)\n", " class_label = vocab[class_idx]\n", - " mbn_s.append(class_label) \n", + " mbn_s.append(class_label)\n", " return mbn_s\n", - " \n", - " \n", + "\n", + "\n", " @staticmethod\n", " def _vad_greedy_decoder(logits, vocab):\n", " vad_s = []\n", @@ -439,16 +431,16 @@ " \"\"\"\n", " Arg:\n", " wav_file: wave file to be performed inference on.\n", - " STEP: infer every STEP seconds \n", + " STEP: infer every STEP seconds\n", " WINDOW_SIZE : length of audio to be sent to NN.\n", " \"\"\"\n", - " \n", - " FRAME_LEN = STEP \n", + "\n", + " FRAME_LEN = STEP\n", " CHANNELS = 1 # number of audio channels (expect mono signal)\n", " RATE = SAMPLE_RATE # sample rate, 16000 Hz\n", - " \n", + "\n", " CHUNK_SIZE = int(FRAME_LEN * SAMPLE_RATE)\n", - " \n", + "\n", " mbn = FrameASR(model_definition = {\n", " 'task': 'mbn',\n", " 'sample_rate': SAMPLE_RATE,\n", @@ -467,10 +459,10 @@ " data = wf.readframes(CHUNK_SIZE)\n", " signal = np.frombuffer(data, dtype=np.int16)\n", " mbn_result = mbn.transcribe(signal)\n", - " \n", + "\n", " if len(mbn_result):\n", " print(mbn_result)\n", - " \n", + "\n", " mbn.reset()" ] }, @@ -545,13 +537,13 @@ "metadata": {}, "outputs": [], "source": [ - "vad_threshold = 0.8 \n", + "vad_threshold = 0.8\n", "\n", - "STEP = 0.1 \n", + "STEP = 0.1\n", "WINDOW_SIZE = 0.15\n", "mbn_WINDOW_SIZE = 1\n", "\n", - "CHANNELS = 1 \n", + "CHANNELS = 1\n", "RATE = SAMPLE_RATE\n", "FRAME_LEN = STEP # use step of vad inference as frame len\n", "\n", @@ -563,7 +555,7 @@ " 'JasperEncoder': vad_cfg.encoder,\n", " 'labels': vad_cfg.labels\n", " },\n", - " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, \n", + " frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,\n", " offset=0)\n", "\n", "mbn = FrameASR(model_definition = {\n", @@ -602,19 +594,19 @@ " print('Please type input device ID:')\n", " dev_idx = int(input())\n", "\n", - " \n", + "\n", " def callback(in_data, frame_count, time_info, status):\n", " \"\"\"\n", " callback function for streaming audio and performing inference\n", " \"\"\"\n", " signal = np.frombuffer(in_data, dtype=np.int16)\n", - " vad_result = vad.transcribe(signal) \n", - " mbn_result = mbn.transcribe(signal) \n", - " \n", + " vad_result = vad.transcribe(signal)\n", + " mbn_result = mbn.transcribe(signal)\n", + "\n", " if len(vad_result):\n", - " # if speech prob is higher than threshold, we decide it contains speech utterance \n", - " # and activate MatchBoxNet \n", - " if vad_result[3] >= vad_threshold: \n", + " # if speech prob is higher than threshold, we decide it contains speech utterance\n", + " # and activate MatchBoxNet\n", + " if vad_result[3] >= vad_threshold:\n", " print(mbn_result) # print mbn result when speech present\n", " else:\n", " print(\"no-speech\")\n", @@ -629,21 +621,21 @@ " stream_callback=callback,\n", " frames_per_buffer=CHUNK_SIZE)\n", "\n", - " \n", + "\n", " print('Listening...')\n", " stream.start_stream()\n", - " \n", + "\n", " # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !\n", " try:\n", " while stream.is_active():\n", " time.sleep(0.1)\n", - " finally: \n", + " 
finally:\n", " stream.stop_stream()\n", " stream.close()\n", " p.terminate()\n", " print()\n", " print(\"PyAudio stopped\")\n", - " \n", + "\n", "else:\n", " print('ERROR: No audio input device found.')" ] diff --git a/tutorials/asr/Streaming_Multitalker_ASR.ipynb b/tutorials/asr/Streaming_Multitalker_ASR.ipynb index 9d9bb76bd8ef..53b1cfaf3190 100644 --- a/tutorials/asr/Streaming_Multitalker_ASR.ipynb +++ b/tutorials/asr/Streaming_Multitalker_ASR.ipynb @@ -27,10 +27,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -145,7 +142,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio, sr=sr) \n", + "signal, sr = librosa.load(an4_audio, sr=sr)\n", "\n", "fig, ax = plt.subplots(1, 1)\n", "fig.set_figwidth(20)\n", @@ -187,7 +184,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_streaming_sortformer_4spk-v2\")\n", "else:\n", " # You can download \".nemo\" file from https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2 and specify the path.\n", @@ -221,7 +218,7 @@ "import math\n", "import torch\n", "import torch.amp\n", - "from tqdm import tqdm \n", + "from tqdm import tqdm\n", "\n", "# If cuda is available, assign the model to cuda\n", "if torch.cuda.is_available():\n", @@ -350,9 +347,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Diarization Predictions (Speaker Activity)', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -383,9 +380,9 @@ "source": [ "from nemo.collections.asr.models import ASRModel\n", "import torch\n", - " \n", + "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " asr_model = ASRModel.from_pretrained(\"nvidia/multitalker-parakeet-streaming-0.6b-v1\")\n", "else:\n", " # You can download \".nemo\" file from https://huggingface.co/nvidia/multitalker-parakeet-streaming-0.6b-v1 and specify the path.\n", @@ -395,7 +392,7 @@ "asr_model.eval()\n", "if torch.cuda.is_available():\n", " asr_model.to(torch.device(\"cuda\"))\n", - " \n", + "\n", "print(\"ASR Model loaded successfully!\")" ] }, @@ -497,7 +494,7 @@ "\n", " # If `cuda` is a negative number, inference will be on CPU only.\n", " cuda: Optional[int] = None\n", - " allow_mps: bool = False \n", + " allow_mps: bool = False\n", " matmul_precision: str = \"highest\" # Literal[\"highest\", \"high\", \"medium\"]\n", "\n", " # ASR Configs\n", @@ -669,7 +666,7 @@ " drop_extra_pre_encoded=drop_extra_pre_encoded,\n", " )\n", " 
pprint(multispk_asr_streamer.instance_manager.batch_asr_states[0].seglsts)\n", - " \n", + "\n", "seglst_dict_list = multispk_asr_streamer.generate_seglst_dicts_from_parallel_streaming(samples=samples)\n", "\n", "from pprint import pprint\n", @@ -706,9 +703,9 @@ " end_time = seglst.get('end_time', 0.0)\n", " words = seglst.get('words', '')\n", " session_id = seglst.get('session_id', '')\n", - " \n", + "\n", " print(f\"[{idx+1}] {speaker} ({start_time:.2f}s - {end_time:.2f}s): {words}\")\n", - " \n", + "\n", " print(f\"\\n{'-'*80}\")\n", " print(f\"Total segments: {len(seglst_dict_list)}\")\n", "else:\n", diff --git a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb index cdcfcc82b786..b8fab09b5787 100644 --- a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb +++ b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb @@ -31,10 +31,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -144,7 +141,7 @@ " plt.axis([0,len(signal),-0.5,+0.5])\n", " time_axis,_ = plt.xticks();\n", " plt.xticks(time_axis[:-1],time_axis[:-1]/sample_rate);\n", - " \n", + "\n", "COLORS=\"b g c m y\".split()\n", "\n", "def get_color(signal,speech_labels,sample_rate=16000):\n", @@ -157,8 +154,8 @@ " else:\n", " code = COLORS[int(label.split('_')[-1])]\n", " c[start:end]=code\n", - " \n", - " return c " + "\n", + " return c" ] }, { @@ -238,18 +235,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a manifest file for input with below format. 
\n", - "# {\"audio_filepath\": \"/path/to/audio_file\", \"offset\": 0, \"duration\": null, \"label\": \"infer\", \"text\": \"-\", \n", + "# Create a manifest file for input with below format.\n", + "# {\"audio_filepath\": \"/path/to/audio_file\", \"offset\": 0, \"duration\": null, \"label\": \"infer\", \"text\": \"-\",\n", "# \"num_speakers\": null, \"rttm_filepath\": \"/path/to/rttm/file\", \"uem_filepath\"=\"/path/to/uem/filepath\"}\n", "import json\n", "meta = {\n", - " 'audio_filepath': AUDIO_FILENAME, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': None, \n", - " 'rttm_filepath': None, \n", + " 'audio_filepath': AUDIO_FILENAME,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': None,\n", + " 'rttm_filepath': None,\n", " 'uem_filepath' : None\n", "}\n", "with open(os.path.join(data_dir,'input_manifest.json'),'w') as fp:\n", @@ -279,10 +276,10 @@ "cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model\n", "cfg.diarizer.clustering.parameters.oracle_num_speakers=False\n", "\n", - "# Using Neural VAD and Conformer ASR \n", + "# Using Neural VAD and Conformer ASR\n", "cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'\n", "cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'\n", - "cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD \n", + "cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD\n", "cfg.diarizer.asr.parameters.asr_based_vad = False" ] }, @@ -576,13 +573,13 @@ "metadata": {}, "outputs": [], "source": [ - "def write_ctm(path, the_list): \n", + "def write_ctm(path, the_list):\n", " outF = open(path, \"w\")\n", " for line in the_list:\n", " outF.write(line)\n", " outF.write(\"\\n\")\n", " outF.close()\n", - " \n", + "\n", "write_ctm(f\"{data_dir}/an4_diarize_test.ctm\", an4_diarize_test_ctm)" ] }, @@ -603,7 +600,7 @@ "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate\n", "from nemo.collections.asr.metrics.wer import word_error_rate\n", "from nemo.collections.asr.parts.utils.diarization_utils import convert_word_dict_seq_to_text, convert_ctm_to_text\n", - "# Provide a list containing the paths to the reference CTM files \n", + "# Provide a list containing the paths to the reference CTM files\n", "# which have the same order with filenames in word_seq_lists.\n", "\n", "word_seq_list = trans_info_dict['an4_diarize_test']['words']\n", @@ -633,7 +630,7 @@ "metadata": {}, "outputs": [], "source": [ - "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate \n", + "from nemo.collections.asr.metrics.der import concat_perm_word_error_rate\n", "from nemo.collections.asr.metrics.wer import word_error_rate\n", "\n", "cpWER, concat_hyp, concat_ref = concat_perm_word_error_rate([spk_hypothesis], [spk_reference])\n", @@ -711,14 +708,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a new manifest file for input with the reference CTM file. 
\n", + "# Create a new manifest file for input with the reference CTM file.\n", "meta = {\n", - " 'audio_filepath': AUDIO_FILENAME, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': 2, \n", + " 'audio_filepath': AUDIO_FILENAME,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': 2,\n", " 'rttm_filepath': None,\n", " 'ctm_filepath': f\"{data_dir}/an4_diarize_test.ctm\",\n", " 'uem_filepath' : None\n", @@ -731,7 +728,7 @@ "cfg.diarizer.manifest_filepath = os.path.join(data_dir,'input_manifest.json')\n", "!cat {cfg.diarizer.manifest_filepath}\n", "\n", - "# We need to call `make_file_lists` again to update manifest file to `asr_diar_offline` instance \n", + "# We need to call `make_file_lists` again to update manifest file to `asr_diar_offline` instance\n", "asr_diar_offline.make_file_lists()" ] }, @@ -799,7 +796,7 @@ " shutil.copyfileobj(f_in, f_out)\n", " f_in.close()\n", " f_out.close()\n", - " \n", + "\n", "ARPA_URL = 'https://kaldi-asr.org/models/5/4gram_big.arpa.gz'\n", "f = wget.download(ARPA_URL, data_dir)\n", "gunzip(f,f.replace(\".gz\",\"\"))" diff --git a/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb b/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb index 273cc00c1f56..c2c46674582a 100644 --- a/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/End_to_End_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -136,7 +133,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -186,7 +183,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_sortformer_4spk-v1\")\n", "else:\n", " # You can downloaded \".nemo\" file from https://huggingface.co/nvidia/diar_sortformer_4spk-v1 and specify the path.\n", @@ -225,9 +222,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Predictions', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -235,7 +232,7 @@ " axs.set_xlabel(f\"80 ms Frames\", fontsize=FS)\n", " axs.grid(which='minor', color=grid_color_p, linestyle='-', linewidth=LW)\n", "\n", - " plt.savefig('plot.png', dpi=300) \n", + " plt.savefig('plot.png', dpi=300)\n", " plt.show()\n", "\n", "\n", @@ -297,7 +294,7 @@ "source": [ "from nemo.collections.asr.parts.utils.vad_utils import 
load_postprocessing_from_yaml\n", "import json\n", - "from omegaconf import OmegaConf \n", + "from omegaconf import OmegaConf\n", "post_processing_params = load_postprocessing_from_yaml(MODEL_CONFIG)\n", "print(json.dumps(OmegaConf.to_container(post_processing_params), indent=4))" ] diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb index cf3416c4ad02..7d46393d0706 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -249,7 +246,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -341,18 +338,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a manifest for input with below format. \n", - "# {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-', \n", + "# Create a manifest for input with below format.\n", + "# {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',\n", "# 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}\n", "import json\n", "meta = {\n", - " 'audio_filepath': an4_audio, \n", - " 'offset': 0, \n", - " 'duration':None, \n", - " 'label': 'infer', \n", - " 'text': '-', \n", - " 'num_speakers': 2, \n", - " 'rttm_filepath': an4_rttm, \n", + " 'audio_filepath': an4_audio,\n", + " 'offset': 0,\n", + " 'duration':None,\n", + " 'label': 'infer',\n", + " 'text': '-',\n", + " 'num_speakers': 2,\n", + " 'rttm_filepath': an4_rttm,\n", " 'uem_filepath' : None\n", "}\n", "with open('data/input_manifest.json','w') as fp:\n", @@ -426,10 +423,10 @@ "config.diarizer.out_dir = output_dir # Directory to store intermediate files and prediction outputs\n", "pretrained_speaker_model = 'titanet_large'\n", "config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model\n", - "config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5] \n", - "config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1] \n", - "config.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1] \n", - "config.diarizer.oracle_vad = True # ----> ORACLE VAD \n", + "config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]\n", + "config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]\n", + "config.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1]\n", + "config.diarizer.oracle_vad = True # ----> ORACLE VAD\n", "config.diarizer.clustering.parameters.oracle_num_speakers = False" ] }, @@ -536,7 +533,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model \n", + "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model\n", 
"config.diarizer.msdd_model.parameters.sigmoid_threshold = [0.7, 1.0] # Evaluate with T=0.7 and T=1.0" ] }, @@ -692,7 +689,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue \n", + "config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue\n", "\n", "output_dir = os.path.join(ROOT, 'outputs')\n", "config.diarizer.manifest_filepath = 'data/input_manifest.json'\n", @@ -784,10 +781,10 @@ "\n", "plot(\n", " an4_audio,\n", - " vad_output_filepath, \n", + " vad_output_filepath,\n", " an4_rttm,\n", " per_args = config.diarizer.vad.parameters, #threshold\n", - " ) \n", + " )\n", "\n", "print(f\"VAD params:{OmegaConf.to_yaml(config.diarizer.vad.parameters)}\")" ] @@ -857,7 +854,7 @@ "metadata": {}, "outputs": [], "source": [ - "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model \n", + "config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model\n", "config.diarizer.msdd_model.parameters.sigmoid_threshold = [0.7, 1.0] # Evaluate with T=0.7 and T=1.0\n", "system_vad_msdd_model = NeuralDiarizer(cfg=config)" ] diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 48f78e0c1c8a..3db99889d92e 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -28,10 +28,7 @@ "\n", "## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "# Install TorchAudio\n", - "!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -324,7 +321,7 @@ "outputs": [], "source": [ "# This line will print the entire config of sample TitaNet model\n", - "!mkdir conf \n", + "!mkdir conf\n", "!wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/speaker_tasks/recognition/conf/titanet-large.yaml\n", "MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/titanet-large.yaml')\n", "config = OmegaConf.load(MODEL_CONFIG)\n", @@ -1120,7 +1117,7 @@ "\n", " all_embs=[]\n", " out_embeddings = {}\n", - " \n", + "\n", " for test_batch in tqdm(speaker_model.test_dataloader()):\n", " test_batch = [x.to(device) for x in test_batch]\n", " audio_signal, audio_signal_len, labels, slices = test_batch\n", diff --git a/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb index 69aeb96c96c7..30431972d5f0 100644 --- a/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Streaming_End_to_End_Diarization_Inference.ipynb @@ -25,10 +25,7 @@ "\n", "# ## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html" + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[asr]" ] }, { @@ -132,7 +129,7 @@ "import librosa\n", "\n", "sr = 16000\n", - "signal, sr = librosa.load(an4_audio,sr=sr) \n", + "signal, sr = librosa.load(an4_audio,sr=sr)\n", "\n", "fig,ax = 
plt.subplots(1,1)\n", "fig.set_figwidth(20)\n", @@ -176,7 +173,7 @@ "import torch\n", "\n", "if get_hf_token() is not None and get_hf_token().startswith(\"hf_\"):\n", - " # If you have logged into HuggingFace hub and have access token \n", + " # If you have logged into HuggingFace hub and have access token\n", " diar_model = SortformerEncLabelModel.from_pretrained(\"nvidia/diar_streaming_sortformer_4spk-v2\")\n", "else:\n", " # You can downloaded \".nemo\" file from https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2 and specify the path.\n", @@ -209,9 +206,9 @@ "\n", " yticklabels = [\"spk0\", \"spk1\", \"spk2\", \"spk3\"]\n", " yticks = np.arange(len(yticklabels))\n", - " fig, axs = plt.subplots(1, 1, figsize=(30, 3)) \n", + " fig, axs = plt.subplots(1, 1, figsize=(30, 3))\n", "\n", - " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest') \n", + " axs.imshow(preds_mat, cmap=cmap_str, interpolation='nearest')\n", " axs.set_title('Predictions', fontsize=FS)\n", " axs.set_xticks(np.arange(-.5, preds_mat.shape[1], 1), minor=True)\n", " axs.set_yticks(yticks)\n", @@ -219,7 +216,7 @@ " axs.set_xlabel(f\"80 ms Frames\", fontsize=FS)\n", " axs.grid(which='minor', color=grid_color_p, linestyle='-', linewidth=LW)\n", "\n", - " plt.savefig('plot.png', dpi=300) \n", + " plt.savefig('plot.png', dpi=300)\n", " plt.show()" ] }, @@ -256,7 +253,7 @@ "import math\n", "import torch\n", "import torch.amp\n", - "from tqdm import tqdm \n", + "from tqdm import tqdm\n", "\n", "# If cuda is available, assign the model to cuda\n", "if torch.cuda.is_available():\n", @@ -374,7 +371,7 @@ " )\n", " # plot the predictions\n", " plot_preds[:,:total_preds.shape[1]] = total_preds\n", - " plot_diarout(plot_preds[0,:]) \n", + " plot_diarout(plot_preds[0,:])\n", " time.sleep(chunk_duration_seconds)" ] }
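Editor's note (illustrative, not part of the patch): with the torchaudio install dropped from these tutorial setup cells, audio loading in the streaming examples goes through librosa, and the streaming loops above feed the models fixed-duration chunks (see `CHUNK_SIZE` and `chunk_duration_seconds` in the hunks above). A minimal sketch of that chunking step follows, assuming only librosa and numpy; the helper name `iter_audio_chunks` is hypothetical and not part of any tutorial.

    import numpy as np
    import librosa

    def iter_audio_chunks(path, sample_rate=16000, chunk_duration_seconds=0.08):
        # Load as mono float32 at the target rate (librosa resamples if needed).
        signal, _ = librosa.load(path, sr=sample_rate)
        chunk_len = int(chunk_duration_seconds * sample_rate)
        # Yield consecutive fixed-length chunks, zero-padding the final one.
        for start in range(0, len(signal), chunk_len):
            chunk = signal[start:start + chunk_len]
            if len(chunk) < chunk_len:
                chunk = np.pad(chunk, (0, chunk_len - len(chunk)))
            yield chunk

Each yielded chunk can then be handed to the per-chunk transcribe/forward calls used in the streaming tutorials, in place of reading frames from a wave file or a PyAudio stream.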