# ailia-models/audio_processing/whisper/ailia_audio_utils.py at master · ailia-ai/ailia-models · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import ailia.audio
import soundfile as sf

# Fixed audio front-end settings shared by the helpers in this module.
SAMPLE_RATE = 16000  # target sample rate, in samples per second
N_FFT = 400  # FFT size handed to the mel spectrogram
HOP_LENGTH = 160  # hop between successive frames, in samples
CHUNK_LENGTH = 30  # chunk duration, in seconds
N_SAMPLES = SAMPLE_RATE * CHUNK_LENGTH  # 480000: number of samples in a chunk
N_FRAMES = N_SAMPLES // HOP_LENGTH  # 3000: number of frames in a mel spectrogram input


def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Read an audio file and return it as a mono waveform sampled at `sr`.

    Parameters
    ----------
    file: str
        Path of the audio file, handed to ``soundfile.read``
    sr: int
        Target sample rate; the waveform is resampled with
        ``ailia.audio.resample`` when the file's own rate differs

    Returns
    -------
    The decoded waveform. Multi-channel input is averaged down to a single
    channel; the sample dtype is whatever ``soundfile.read`` produced.
    """
    wav, source_sr = sf.read(file)

    # soundfile lays multi-channel audio out as (frames, channels). Average
    # over the channel axis for ANY channel count, not only stereo, so that
    # e.g. 4- or 6-channel files do not reach the caller as a 2-D array.
    if wav.ndim >= 2:
        wav = np.mean(wav, axis=1)

    # Resample only when the file's rate differs from the requested one.
    if source_sr is not None and source_sr != sr:
        wav = ailia.audio.resample(wav, org_sr=source_sr, target_sr=sr)

    return wav


def pad_or_trim(array, length=N_SAMPLES, axis=-1):
    """
    Force `array` to hold exactly `length` entries along `axis`.

    Inputs that are too long keep only their first `length` entries along
    that axis; inputs that are too short are padded at the end with
    ``np.pad``'s default fill. An input that already has the right size is
    returned unchanged.
    """
    current = array.shape[axis]

    # Too long: keep the leading `length` entries (take() yields a copy).
    if current > length:
        return array.take(np.arange(length), axis=axis)

    # Too short: pad only the trailing side of the selected axis.
    if current < length:
        widths = [(0, 0) for _ in range(array.ndim)]
        widths[axis] = (0, length - current)
        return np.pad(array, widths)

    return array


def log_mel_spectrogram(audio, n_mels: int = 80, padding: int = 0):
    """
    Compute the scaled log-Mel spectrogram of an audio waveform.

    Parameters
    ----------
    audio: np.ndarray
        The waveform to analyze
    n_mels: int
        The number of Mel-frequency filters, forwarded as `mel_n`
        (noted as "only 80 is supported"; this function does not enforce it)
    padding: int
        Number of zero samples to pad to the right

    Returns
    -------
    The log-Mel spectrogram after range limiting and rescaling; presumably
    shaped (n_mels, n_frames) -- TODO confirm against ailia.audio.mel_spectrogram
    """
    samples = np.pad(audio, (0, padding)) if padding > 0 else audio

    power_mel = ailia.audio.mel_spectrogram(
        samples,
        sample_rate=SAMPLE_RATE,
        fft_n=N_FFT,
        hop_n=HOP_LENGTH,
        win_type="hann",
        center_mode=1,
        power=2.0,
        mel_n=n_mels,
    )

    # Floor the values at 1e-10 before log10 so zeros do not become -inf.
    log_mel = np.log10(np.clip(power_mel, 1e-10, None))

    # Keep at most 8.0 (in log10 units) of range below the global peak.
    peak = np.max(log_mel)
    log_mel = np.maximum(log_mel, peak - 8.0)

    # Shift and scale the result.
    return (log_mel + 4.0) / 4.0