diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index 00558769b020..d30e5172a7b4 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -56,10 +56,11 @@ except ModuleNotFoundError: HAVE_PYDUB = False - +import librosa available_formats = sf.available_formats() sf_supported_formats = ["." + i.lower() for i in available_formats.keys()] +sf_supported_formats.append(".opus") ChannelSelectorType = Union[int, Iterable[int], str] @@ -347,11 +348,40 @@ def from_file( except RuntimeError as e: logging.error( f"Loading {audio_file} via SoundFile raised RuntimeError: `{e}`. " - f"NeMo will fallback to loading via pydub." + f"NeMo will fallback to loading via Librosa." ) if hasattr(audio_file, "seek"): audio_file.seek(0) + if samples is None: + try: + dtype = 'int32' if int_values else 'float32' + lr_sr = None if target_sr is None else target_sr + + y, sr = librosa.load( + audio_file, + sr=lr_sr, + mono=False, + offset=(offset if offset is not None else 0.0), + duration=(duration if duration is not None and duration > 0 else None), + ) + + if y.ndim == 1: + samples = y.astype('float32') + else: + + samples = y.T.astype('float32') + if int_values: + + samples = (samples * (2**31)).astype('int32') + sample_rate = sr + print("Loaded using librosa") + except Exception as e: + logging.error( + f"Loading {audio_file} via librosa raised Exception: `{e}`. " + "NeMo will fallback to loading via pydub if available." + ) + samples = None if HAVE_PYDUB and samples is None: try: