14 changes: 0 additions & 14 deletions docker/Dockerfile.speech
@chtruong814 could you help review the Dockerfile changes?

@@ -22,8 +22,6 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
FROM ${BASE_IMAGE} as nemo-deps

# dependency flags; should be declared after FROM
# torchaudio: not required by default
ARG REQUIRE_TORCHAUDIO=false
# k2: not required by default
ARG REQUIRE_K2=false
# ais cli: not required by default, install only if required
@@ -96,18 +94,6 @@ WORKDIR /tmp/
# uninstall stuff from base container
RUN pip3 uninstall -y sacrebleu torchtext

# build torchaudio
WORKDIR /tmp/torchaudio_build
COPY scripts/installers /tmp/torchaudio_build/scripts/installers/
RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \
echo ${INSTALL_MSG}; \
if [ ${INSTALL_CODE} -ne 0 ]; then \
echo "torchaudio installation failed"; \
if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \
exit ${INSTALL_CODE}; \
else echo "Skipping failed torchaudio installation"; fi \
else echo "torchaudio installed successfully"; fi

COPY scripts /tmp/nemo/scripts/
# install correct graphviz version (k2 and pynini visualization tool), skip if installation fails
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \
2 changes: 1 addition & 1 deletion examples/audio/process_audio.py
@@ -24,7 +24,7 @@
import torch
from omegaconf import OmegaConf

from nemo.collections.audio.models import AudioToAudioModel
from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel
from nemo.core.config import hydra_runner
from nemo.utils import logging, model_utils

38 changes: 6 additions & 32 deletions nemo/collections/asr/modules/audio_preprocessing.py
@@ -19,11 +19,11 @@
from typing import Any, Optional

import torch
from packaging import version

from nemo.collections.asr.parts.numba.spec_augment import SpecAugmentNumba, spec_augment_launch_heuristics
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures, FilterbankFeaturesTA
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures
from nemo.collections.asr.parts.submodules.spectr_augment import SpecAugment, SpecCutout
from nemo.collections.audio.parts.utils.transforms import MFCC
from nemo.core.classes import Exportable, NeuralModule, typecheck
from nemo.core.neural_types import (
AudioSignal,
@@ -37,18 +37,6 @@
from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__
from nemo.utils import logging, logging_mode

try:
import torchaudio
import torchaudio.functional
import torchaudio.transforms

TORCHAUDIO_VERSION = version.parse(torchaudio.__version__)
TORCHAUDIO_VERSION_MIN = version.parse('0.5')

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False

__all__ = [
'AudioToMelSpectrogramPreprocessor',
'AudioToMFCCPreprocessor',
@@ -171,7 +159,6 @@ class AudioToMelSpectrogramPreprocessor(AudioPreprocessor, Exportable):
Defaults to 0.0
nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation.
Defaults to 4000
use_torchaudio: Whether to use the `torchaudio` implementation.
mel_norm: Normalization used for mel filterbank weights.
Defaults to 'slaney' (area normalization)
stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints.
@@ -237,8 +224,8 @@ def __init__(
rng=None,
nb_augmentation_prob=0.0,
nb_max_freq=4000,
use_torchaudio: bool = False,
mel_norm="slaney",
use_torchaudio: bool = False, # Deprecated arguments; kept for config compatibility
stft_exact_pad=False, # Deprecated arguments; kept for config compatibility
stft_conv=False, # Deprecated arguments; kept for config compatibility
):
@@ -256,11 +243,7 @@ def __init__(
super().__init__(n_window_size, n_window_stride)

# Given the long and similar argument list, point to the class and instantiate it by reference
if not use_torchaudio:
featurizer_class = FilterbankFeatures
else:
featurizer_class = FilterbankFeaturesTA
self.featurizer = featurizer_class(
self.featurizer = FilterbankFeatures(
sample_rate=self._sample_rate,
n_window_size=n_window_size,
n_window_stride=n_window_stride,
@@ -306,7 +289,6 @@ def filter_banks(self):

class AudioToMFCCPreprocessor(AudioPreprocessor):
"""Preprocessor that converts wavs to MFCCs.
Uses torchaudio.transforms.MFCC.

Args:
sample_rate: The sample rate of the audio.
@@ -382,14 +364,6 @@ def __init__(
log=True,
):
self._sample_rate = sample_rate
if not HAVE_TORCHAUDIO:
logging.error('Could not import torchaudio. Some features might not work.')

raise ModuleNotFoundError(
"torchaudio is not installed but is necessary for "
"AudioToMFCCPreprocessor. We recommend you try "
"building it from source for the PyTorch version you have."
)
if window_size and n_window_size:
raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
if window_stride and n_window_stride:
@@ -425,7 +399,7 @@ def __init__(
mel_kwargs['window_fn'] = window_fn

# Use torchaudio's implementation of MFCCs as featurizer
self.featurizer = torchaudio.transforms.MFCC(
self.featurizer = MFCC(
sample_rate=self._sample_rate,
n_mfcc=n_mfcc,
dct_type=dct_type,
@@ -746,8 +720,8 @@ class AudioToMelSpectrogramPreprocessorConfig:
rng: Optional[str] = None
nb_augmentation_prob: float = 0.0
nb_max_freq: int = 4000
use_torchaudio: bool = False
mel_norm: str = "slaney"
use_torchaudio: bool = False # Deprecated argument, kept for compatibility with older checkpoints.
stft_exact_pad: bool = False # Deprecated argument, kept for compatibility with older checkpoints.
stft_conv: bool = False # Deprecated argument, kept for compatibility with older checkpoints.

193 changes: 0 additions & 193 deletions nemo/collections/asr/parts/preprocessing/features.py
@@ -34,7 +34,6 @@
# This file contains code artifacts adapted from https://github.com/ryanleary/patter
import math
import random
from typing import Optional, Tuple, Union

import librosa
import numpy as np
@@ -45,14 +44,6 @@
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
from nemo.utils import logging

try:
import torchaudio

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False


CONSTANT = 1e-5


@@ -499,187 +490,3 @@ def forward(self, x, seq_len, linear_spec=False):
if pad_amt != 0:
x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
return x, seq_len


class FilterbankFeaturesTA(nn.Module):
"""
Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.

See `AudioToMelSpectrogramPreprocessor` for args.

"""

def __init__(
self,
sample_rate: int = 16000,
n_window_size: int = 320,
n_window_stride: int = 160,
normalize: Optional[str] = "per_feature",
nfilt: int = 64,
n_fft: Optional[int] = None,
preemph: float = 0.97,
lowfreq: float = 0,
highfreq: Optional[float] = None,
log: bool = True,
log_zero_guard_type: str = "add",
log_zero_guard_value: Union[float, str] = 2**-24,
dither: float = 1e-5,
window: str = "hann",
pad_to: int = 0,
pad_value: float = 0.0,
mel_norm="slaney",
# Seems like no one uses these options anymore. Don't convolute the code by supporting thm.
use_grads: bool = False, # Deprecated arguments; kept for config compatibility
max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility
frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility
exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility
nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility
mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility
rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility
stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
stft_conv: bool = False, # Deprecated arguments; kept for config compatibility
):
super().__init__()
if not HAVE_TORCHAUDIO:
raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}")

# Make sure log zero guard is supported, if given as a string
supported_log_zero_guard_strings = {"eps", "tiny"}
if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings:
raise ValueError(
f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}"
)

# Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class
self.torch_windows = {
'hann': torch.hann_window,
'hamming': torch.hamming_window,
'blackman': torch.blackman_window,
'bartlett': torch.bartlett_window,
'ones': torch.ones,
None: torch.ones,
}

# Ensure we can look up the window function
if window not in self.torch_windows:
raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}")

self.win_length = n_window_size
self.hop_length = n_window_stride
self._sample_rate = sample_rate
self._normalize_strategy = normalize
self._use_log = log
self._preemphasis_value = preemph
self.log_zero_guard_type = log_zero_guard_type
self.log_zero_guard_value: Union[str, float] = log_zero_guard_value
self.dither = dither
self.pad_to = pad_to
self.pad_value = pad_value
self.n_fft = n_fft
self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram(
sample_rate=self._sample_rate,
win_length=self.win_length,
hop_length=self.hop_length,
n_mels=nfilt,
window_fn=self.torch_windows[window],
mel_scale="slaney",
norm=mel_norm,
n_fft=n_fft,
f_max=highfreq,
f_min=lowfreq,
wkwargs={"periodic": False},
)

@property
def filter_banks(self):
"""Matches the analogous class"""
return self._mel_spec_extractor.mel_scale.fb

def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float:
if isinstance(self.log_zero_guard_value, float):
return self.log_zero_guard_value
return getattr(torch.finfo(dtype), self.log_zero_guard_value)

def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor:
if self.training and self.dither > 0.0:
noise = torch.randn_like(signals) * self.dither
signals = signals + noise
return signals

def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor:
if self._preemphasis_value is not None:
padded = torch.nn.functional.pad(signals, (1, 0))
signals = signals - self._preemphasis_value * padded[:, :-1]
return signals

def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor:
out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long()
return out_lengths

def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor:
# Only apply during training; else need to capture dynamic shape for exported models
if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0:
return features
pad_length = self.pad_to - (features.shape[-1] % self.pad_to)
return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value)

def _apply_log(self, features: torch.Tensor) -> torch.Tensor:
if self._use_log:
zero_guard = self._resolve_log_zero_guard_value(features.dtype)
if self.log_zero_guard_type == "add":
features = features + zero_guard
elif self.log_zero_guard_type == "clamp":
features = features.clamp(min=zero_guard)
else:
raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'")
features = features.log()
return features

def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor:
# Complex FFT needs to be done in single precision
with torch.amp.autocast('cuda', enabled=False):
features = self._mel_spec_extractor(waveform=signals)
return features

def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
# For consistency, this function always does a masked fill even if not normalizing.
mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False)
features = features.masked_fill(mask, 0.0)
# Maybe don't normalize
if self._normalize_strategy is None:
return features
# Use the log zero guard for the sqrt zero guard
guard_value = self._resolve_log_zero_guard_value(features.dtype)
if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features":
# 'all_features' reduces over each sample; 'per_feature' reduces over each channel
reduce_dim = 2
if self._normalize_strategy == "all_features":
reduce_dim = [1, 2]
# [B, D, T] -> [B, D, 1] or [B, 1, 1]
means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1))
stds = (
features.sub(means)
.masked_fill(mask, 0.0)
.pow(2.0)
.sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1]
.div(lengths.view(-1, 1, 1) - 1) # assume biased estimator
.clamp(min=guard_value) # avoid sqrt(0)
.sqrt()
)
features = (features - means) / (stds + eps)
else:
# Deprecating constant std/mean
raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}")
features = features.masked_fill(mask, 0.0)
return features

def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
feature_lengths = self._compute_output_lengths(input_lengths=length)
signals = self._apply_dithering(signals=input_signal)
signals = self._apply_preemphasis(signals=signals)
features = self._extract_spectrograms(signals=signals)
features = self._apply_log(features=features)
features = self._apply_normalization(features=features, lengths=feature_lengths)
features = self._apply_pad_to(features=features)
return features, feature_lengths
2 changes: 0 additions & 2 deletions nemo/collections/audio/losses/__init__.py
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.audio.losses.audio import MAELoss, MSELoss, SDRLoss
17 changes: 2 additions & 15 deletions nemo/collections/audio/losses/maxine/losses_combined.py
@@ -18,15 +18,8 @@

import torch

try:
from torchaudio.functional import resample
from torchaudio.transforms import MelSpectrogram

HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
HAVE_TORCHAUDIO = False

from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.models.asr_model import ASRModel
from nemo.collections.audio.parts.utils.transforms import MelSpectrogram, resample
from nemo.core import Loss, Typing, typecheck
from nemo.core.neural_types import LengthsType, LossType, NeuralType, VoidType
from nemo.utils import logging
@@ -94,12 +87,6 @@ def __init__(
conformer_model=STT_EN_CONFORMER_CTC_SMALL_v1_6_0,
epsilon=float(5.9604644775390625e-8),
):
if not HAVE_TORCHAUDIO:
logging.error('Could not import torchaudio. Some features might not work.')

raise ModuleNotFoundError(
f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}"
)

super().__init__()
self.sample_rate = sample_rate