
Commit b8a7a1b

Remove librosa dependency from package

1 parent f2e319a commit b8a7a1b

8 files changed: +179 -16 lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
+## [1.1.2] - 2022-09-29
+### Changed
+- Removed the `librosa` library dependency from requirements; it is now optional and required only by modules that use librosa
+
+### Added
+- Created `Tutorials/05_sound_to_text/train_no_limit.py`, which demonstrates how to train an audio recognition model with `mltu` without an audio length limit
+
 ## [1.1.1] - 2022-09-26
 ### Changed
 - Included `self._executor` as a generator in the `mltu.dataProvider.DataProvider` object, enabling batch preprocessing to be modified without changing the original code

Tutorials/05_sound_to_text/model.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 
 def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
 
-    inputs = layers.Input(shape=input_dim, name="input")
+    inputs = layers.Input(shape=input_dim, name="input", dtype=tf.float32)
 
     # expand dims to add channel dimension
     input = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs)
@@ -46,7 +46,7 @@ def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
     x = layers.Dropout(dropout)(x)
 
     # Classification layer
-    output = layers.Dense(output_dim + 1, activation="softmax")(x)
+    output = layers.Dense(output_dim + 1, activation="softmax", dtype=tf.float32)(x)
 
     model = Model(inputs=inputs, outputs=output)
     return model
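For context on the two `dtype=tf.float32` pins: the new tutorial script below enables the `mixed_float16` policy, under which Keras layers compute in float16 by default. Keeping the model's input and its final softmax in float32 is the usual way to preserve numerical stability at the model boundaries. A minimal sketch of the effect, assuming TensorFlow 2.x with Keras mixed precision:

```python
import tensorflow as tf
from keras import layers, Model

# With this policy active, layers compute in float16 (variables stay float32)
tf.keras.mixed_precision.set_global_policy("mixed_float16")

inputs = layers.Input(shape=(None, 193), name="input", dtype=tf.float32)
x = layers.Dense(64)(inputs)  # intermediate activations are float16 here
# Pinning the classification layer to float32 keeps the softmax numerically stable
output = layers.Dense(10, activation="softmax", dtype=tf.float32)(x)

model = Model(inputs=inputs, outputs=output)
print(model.output.dtype)  # float32
```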
Tutorials/05_sound_to_text/train_no_limit.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass
tf.keras.mixed_precision.set_global_policy('mixed_float16') # mixed precision training for faster training time

import os
import tarfile
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import WavReader

from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

from model import train_model
from configs import ModelConfigs


def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
    http_response = urlopen(url)

    data = b""
    iterations = http_response.length // chunk_size + 1
    for _ in tqdm(range(iterations)):
        data += http_response.read(chunk_size)

    tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2")
    tarFile.extractall(path=extract_to)
    tarFile.close()


dataset_path = os.path.join("Datasets", "LJSpeech-1.1")
if not os.path.exists(dataset_path):
    download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets")

dataset_path = "Datasets/LJSpeech-1.1"
metadata_path = dataset_path + "/metadata.csv"
wavs_path = dataset_path + "/wavs/"

# Read metadata file and parse it
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
metadata_df.columns = ["file_name", "transcription", "normalized_transcription"]
metadata_df = metadata_df[["file_name", "normalized_transcription"]]

# Structure the dataset where each row is a list of [wav_file_path, sound transcription]
dataset = [[f"Datasets/LJSpeech-1.1/wavs/{file}.wav", label.lower()] for file, label in metadata_df.values.tolist()]

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()
configs.save()

# Create a data provider for the dataset
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[
        WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length),
        ],
    transformers=[
        LabelIndexer(configs.vocab),
        ],
    batch_postprocessors=[
        SpectrogramPadding(padding_value=0, use_on_batch=True),
        LabelPadding(padding_value=len(configs.vocab), use_on_batch=True),
        ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split = 0.9)

# Creating TensorFlow model architecture
model = train_model(
    input_dim = (None, 193),
    output_dim = len(configs.vocab),
    dropout=0.5
)

# Compile the model and print summary
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
        ],
    run_eagerly=False
)
model.summary(line_length=110)

# Define callbacks
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto")
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

# Train the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers,
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))
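Two details make this script "no limit": the model is built with `input_dim = (None, 193)`, leaving the time axis unbounded, and padding happens per batch in `batch_postprocessors` rather than to a global maximum spectrogram length. The 193 feature bins are the one-sided STFT bin count; a quick sanity check, assuming the tutorial's configs use `fft_length=384` (the value is not shown in this diff):

```python
# Hypothetical check: frequency bins produced by a one-sided STFT
fft_length = 384              # assumed value of configs.fft_length
n_bins = fft_length // 2 + 1  # rfft bin count
print(n_bins)                 # 193 -> matches input_dim = (None, 193)
```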

mltu/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "1.1.1"
+__version__ = "1.1.2"
 
 from .annotations.images import Image
 from .annotations.images import CVImage

mltu/dataProvider.py

Lines changed: 3 additions & 1 deletion
@@ -216,7 +216,6 @@ def executor(batch_data):
 
     def __iter__(self):
         """ Create a generator that iterate over the Sequence."""
-        self.start_executor()
         for index in range(len(self)):
             results = self[index]
             yield results
@@ -269,6 +268,9 @@ def __getitem__(self, index: int):
         Returns:
             tuple: batch of data and batch of annotations
         """
+        if index==0:
+            self.start_executor()
+
         dataset_batch = self.get_batch_annotations(index)
 
         # First read and preprocess the batch data
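Moving `start_executor` from `__iter__` into `__getitem__` at index 0 means the executor also starts when batches are fetched by index, as Keras Sequence-style training loops do, rather than only when the provider is iterated. A minimal sketch of the lazy-start pattern, with hypothetical names:

```python
class BatchSource:
    """Toy provider illustrating lazy executor start on the first batch."""
    def __init__(self, batches):
        self._batches = batches
        self._started = False

    def start_executor(self):
        # Stand-in for spinning up worker threads or a generator pipeline
        self._started = True

    def __len__(self):
        return len(self._batches)

    def __getitem__(self, index):
        if index == 0:  # lazy start on the first batch request
            self.start_executor()
        return self._batches[index]

    def __iter__(self):
        for index in range(len(self)):
            yield self[index]  # __getitem__ handles the start

source = BatchSource([["a"], ["b"]])
print(source[0], source[1])  # works even when no one calls __iter__
```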

mltu/preprocessors.py

Lines changed: 8 additions & 5 deletions
@@ -1,7 +1,5 @@
 import os
 import typing
-import librosa
-import librosa.display
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib
@@ -106,6 +104,11 @@ class WavReader:
         frame_step (int): Step size between frames in samples.
         fft_length (int): Number of FFT components.
     """
+    try:
+        import librosa
+    except ImportError:
+        raise ImportError("librosa is required to read Audio. Please install it with `pip install librosa`.")
+
     def __init__(
         self,
         frame_length: int = 256,
@@ -133,12 +136,12 @@ def get_spectrogram(wav_path: str, frame_length: int, frame_step: int, fft_lengt
             np.ndarray: Spectrogram of the WAV file.
         """
         # Load the wav file and store the audio data in the variable 'audio' and the sample rate in 'orig_sr'
-        audio, orig_sr = librosa.load(wav_path)
+        audio, orig_sr = WavReader.librosa.load(wav_path)
 
         # Compute the Short Time Fourier Transform (STFT) of the audio data and store it in the variable 'spectrogram'
         # The STFT is computed with a hop length of 'frame_step' samples, a window length of 'frame_length' samples, and 'fft_length' FFT components.
         # The resulting spectrogram is also transposed for convenience
-        spectrogram = librosa.stft(audio, hop_length=frame_step, win_length=frame_length, n_fft=fft_length).T
+        spectrogram = WavReader.librosa.stft(audio, hop_length=frame_step, win_length=frame_length, n_fft=fft_length).T
 
         # Take the absolute value of the spectrogram to obtain the magnitude spectrum
         spectrogram = np.abs(spectrogram)
@@ -162,7 +165,7 @@ def plot_raw_audio(wav_path: str, title: str = None, sr: int = 16000) -> None:
             title (str, optional): Title
         """
         # Load the wav file and store the audio data in the variable 'audio' and the sample rate in 'orig_sr'
-        audio, orig_sr = librosa.load(wav_path, sr=sr)
+        audio, orig_sr = WavReader.librosa.load(wav_path, sr=sr)
 
         duration = len(audio) / orig_sr
 
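With librosa gone from the hard requirements, `WavReader` binds it as a class attribute instead: the `try`/`except` runs when the class body is executed, and the methods reference the module as `WavReader.librosa`. A minimal sketch of the same pattern with a hypothetical class (`librosa.load` and `librosa.stft` are the real calls):

```python
import numpy as np

class Reader:
    # Runs when the class body executes; binds the module as Reader.librosa
    try:
        import librosa
    except ImportError:
        raise ImportError("librosa is required to read Audio. Please install it with `pip install librosa`.")

    @staticmethod
    def load_magnitude(wav_path: str) -> np.ndarray:
        # Methods reach the module through the class attribute
        audio, orig_sr = Reader.librosa.load(wav_path)
        return np.abs(Reader.librosa.stft(audio))
```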
mltu/transformers.py

Lines changed: 41 additions & 6 deletions
@@ -137,18 +137,34 @@ class LabelPadding(Transformer):
     """Pad label to max_word_length
 
     Attributes:
-        max_word_length (int): Maximum length of label
         padding_value (int): Value to pad
+        max_word_length (int): Maximum length of label
+        use_on_batch (bool): Whether to use on batch. Default: False
     """
     def __init__(
         self,
-        max_word_length: int,
-        padding_value: int
+        padding_value: int,
+        max_word_length: int = None,
+        use_on_batch: bool = False
         ) -> None:
         self.max_word_length = max_word_length
         self.padding_value = padding_value
+        self.use_on_batch = use_on_batch
+
+        if not use_on_batch and max_word_length is None:
+            raise ValueError("max_word_length must be specified if use_on_batch is False")
 
     def __call__(self, data: np.ndarray, label: np.ndarray):
+        if self.use_on_batch:
+            max_len = max([len(a) for a in label])
+            padded_labels = []
+            for l in label:
+                padded_label = np.pad(l, (0, max_len - len(l)), "constant", constant_values=self.padding_value)
+                padded_labels.append(padded_label)
+
+            padded_labels = np.array(padded_labels)
+            return data, padded_labels
+
         label = label[:self.max_word_length]
         return data, np.pad(label, (0, self.max_word_length - len(label)), "constant", constant_values=self.padding_value)
 
@@ -157,22 +173,41 @@ class SpectrogramPadding(Transformer):
     """Pad spectrogram to max_spectrogram_length
 
     Attributes:
-        max_spectrogram_length (int): Maximum length of spectrogram
         padding_value (int): Value to pad
+        max_spectrogram_length (int): Maximum length of spectrogram. Must be specified if use_on_batch is False. Default: None
+        use_on_batch (bool): Whether to use on batch. Default: False
     """
     def __init__(
         self,
-        max_spectrogram_length: int,
-        padding_value: int
+        padding_value: int,
+        max_spectrogram_length: int = None,
+        use_on_batch: bool = False
         ) -> None:
         self.max_spectrogram_length = max_spectrogram_length
         self.padding_value = padding_value
+        self.use_on_batch = use_on_batch
+
+        if not use_on_batch and max_spectrogram_length is None:
+            raise ValueError("max_spectrogram_length must be specified if use_on_batch is False")
 
     def __call__(self, spectrogram: np.ndarray, label: np.ndarray):
+        if self.use_on_batch:
+            max_len = max([len(a) for a in spectrogram])
+            padded_spectrograms = []
+            for spec in spectrogram:
+                padded_spectrogram = np.pad(spec, ((0, max_len - spec.shape[0]), (0,0)), mode="constant", constant_values=self.padding_value)
+                padded_spectrograms.append(padded_spectrogram)
+
+            padded_spectrograms = np.array(padded_spectrograms)
+            label = np.array(label)
+
+            return padded_spectrograms, label
+
         padded_spectrogram = np.pad(spectrogram, ((0, self.max_spectrogram_length - spectrogram.shape[0]),(0,0)), mode="constant", constant_values=self.padding_value)
 
         return padded_spectrogram, label
 
+
 class AudioPadding(Transformer):
     def __init__(self, max_audio_length: int, padding_value: int = 0, use_on_batch: bool = False, limit: bool = False):
         super(AudioPadding, self).__init__()
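With `use_on_batch=True`, both transformers pad to the longest item in the current batch instead of to a fixed maximum, which is what lets `train_no_limit.py` drop the global audio length limit. A small usage sketch with toy arrays (`padding_value=30` stands in for `len(configs.vocab)`; labels are padded first here so each transformer receives well-formed input):

```python
import numpy as np
from mltu.transformers import LabelPadding, SpectrogramPadding

# Toy batch: two spectrograms of different lengths, each shaped (time, 193)
specs = [np.zeros((50, 193)), np.zeros((80, 193))]
labels = [np.array([1, 2, 3]), np.array([4, 5])]

specs, labels = LabelPadding(padding_value=30, use_on_batch=True)(specs, labels)
specs, labels = SpectrogramPadding(padding_value=0, use_on_batch=True)(specs, labels)

print(specs.shape)   # (2, 80, 193): padded to the longest spectrogram in the batch
print(labels.shape)  # (2, 3): padded to the longest label in the batch
```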
requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -5,5 +5,4 @@ numpy
 opencv-python
 Pillow>=9.4.0
 onnxruntime>=1.15.0 # onnxruntime-gpu for GPU support
-librosa>=0.9.2
 matplotlib
