
Commit e947404

save new stuff related to audio processing
1 parent df58e7c commit e947404

File tree

12 files changed: +1305 −60 lines

Tutorials/10_speech_transformer/augment_audio.ipynb

Lines changed: 312 additions & 0 deletions (large diff, not rendered by default)

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@

```python
import os
from datetime import datetime

from mltu.configs import BaseModelConfigs


class ModelConfigs(BaseModelConfigs):
    def __init__(self):
        super().__init__()
        # Timestamped output directory for this training run
        self.model_path = os.path.join(
            "Models/10_speech_transformer",
            datetime.strftime(datetime.now(), "%Y%m%d%H%M"),
        )

        # STFT / spectrogram parameters
        self.frame_length = 256
        self.frame_step = 160
        self.fft_length = 384

        # Model input: up to 2048 time frames x 193 frequency bins (fft_length // 2 + 1)
        self.input_shape = (2048, 193)
        self.max_spectrogram_length = 2048

        # Training parameters
        self.batch_size = 128
        self.train_epochs = 1000
        self.train_workers = 20

        # Transformer architecture
        self.num_layers_encoder = 2
        self.num_layers_decoder = 2
        self.d_model = 512
        self.num_heads = 8
        self.dff = 2048
        self.dropout_rate = 0.1

        # Learning-rate warmup/decay schedule
        self.init_lr = 0.000001
        self.lr_after_warmup = 0.0003
        self.final_lr = 0.0001
        self.warmup_epochs = 3
        self.decay_epochs = 40
```
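
A quick shape check (illustrative only, not part of the commit): assuming the spectrograms are produced by an STFT with the parameters above, the frequency dimension is `fft_length // 2 + 1 = 193`, which is where the 193 in `input_shape = (2048, 193)` comes from.

```python
import tensorflow as tf

# Dummy 10 s clip at an assumed 16 kHz sample rate, just to check shapes
audio = tf.random.normal([16000 * 10])
stft = tf.signal.stft(audio, frame_length=256, frame_step=160, fft_length=384)
print(tf.abs(stft).shape)  # (999, 193): time frames vary with clip length
```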
Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@

```python
import tensorflow as tf

from mltu.tensorflow.transformer.layers import EncoderLayer, Decoder, PositionalEmbedding


class SpeechFeatureEmbedding(tf.keras.layers.Layer):
    """Embeds spectrogram frames into d_model-dimensional features.

    Three strided Conv1D blocks, each followed by batch normalization and
    a GELU activation, downsample the time axis by a factor of 8 before
    the Transformer encoder.
    """
    def __init__(self, d_model=64):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.conv2 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.conv3 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.bn = tf.keras.layers.BatchNormalization()
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.gelu1 = tf.keras.layers.Activation("gelu")
        self.gelu2 = tf.keras.layers.Activation("gelu")
        self.gelu3 = tf.keras.layers.Activation("gelu")
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, x):
        # Each block halves the time dimension: Conv1D -> BatchNorm -> GELU
        x = self.gelu1(self.bn(self.conv1(x)))
        x = self.gelu2(self.bn2(self.conv2(x)))
        x = self.gelu3(self.bn3(self.conv3(x)))
        return self.dropout(x)
```
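A smoke test of the embedding (a sketch, not part of the commit): with the config's input shape, the three stride-2 convolutions cut 2048 frames down to 256.

```python
import tensorflow as tf

# Hypothetical shape check for SpeechFeatureEmbedding
emb = SpeechFeatureEmbedding(d_model=512)
dummy = tf.random.normal([2, 2048, 193])  # (batch, time, freq) per ModelConfigs
print(emb(dummy).shape)                   # (2, 256, 512): 2048 / 2**3 time steps
```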
```python
class Encoder(tf.keras.layers.Layer):
    """A custom TensorFlow layer that implements the Transformer encoder stack.

    In this speech model it runs on speech feature embeddings rather than
    token embeddings, so the positional embedding is built without a vocabulary.

    Methods:
        call: Performs the forward pass of the layer.

    Attributes:
        d_model (int): The dimensionality of the model.
        num_layers (int): The number of layers in the encoder.
        pos_embedding (PositionalEmbedding): The positional embedding layer.
        enc_layers (list): The list of encoder layers.
        dropout (tf.keras.layers.Dropout): The dropout layer.
    """
    def __init__(self, num_layers: int, d_model: int, num_heads: int, dff: int, dropout_rate: float = 0.1, activation: str = "relu"):
        """Constructor of the Encoder.

        Args:
            num_layers (int): The number of layers in the encoder.
            d_model (int): The dimensionality of the model.
            num_heads (int): The number of heads in the multi-head attention layer.
            dff (int): The dimensionality of the feed-forward layer.
            dropout_rate (float): The dropout rate.
            activation (str): The activation function of the feed-forward layers.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=None, d_model=d_model)

        self.enc_layers = [
            EncoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                dff=dff,
                dropout_rate=dropout_rate,
                activation=activation,
            )
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Performs the forward pass of the layer.

        Args:
            x (tf.Tensor): The input features of shape (batch_size, seq_len, d_model).

        Returns:
            tf.Tensor: The output sequence of shape (batch_size, seq_len, d_model).
        """
        x = self.pos_embedding(x)
        # Here x has shape (batch_size, seq_len, d_model)

        # Add dropout.
        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # Shape (batch_size, seq_len, d_model)
```
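The encoder can be exercised on already-embedded features in the same way (illustrative only; this assumes mltu's PositionalEmbedding accepts `vocab_size=None` as the committed code does):

```python
# Sketch: run the encoder stack over embedded speech features
enc = Encoder(num_layers=2, d_model=512, num_heads=8, dff=2048)
feats = tf.random.normal([2, 256, 512])  # e.g. SpeechFeatureEmbedding output
print(enc(feats).shape)                  # (2, 256, 512)
```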
```python
def SpeechTransformer(
    target_vocab_size: int,
    encoder_input_shape: tuple = None,
    decoder_input_shape: tuple = None,
    num_layers_encoder: int = 6,
    num_layers_decoder: int = 6,
    d_model: int = 512,
    num_heads: int = 8,
    dff: int = 2048,
    dropout_rate: float = 0.1,
    activation: str = "relu",
) -> tf.keras.Model:
    """A custom TensorFlow model that implements the Transformer architecture for speech.

    Args:
        target_vocab_size (int): The size of the target vocabulary.
        encoder_input_shape (tuple): The shape of the encoder input (spectrogram frames).
        decoder_input_shape (tuple): The shape of the decoder input (token ids).
        num_layers_encoder (int): The number of layers in the encoder.
        num_layers_decoder (int): The number of layers in the decoder.
        d_model (int): The dimensionality of the model.
        num_heads (int): The number of heads in the multi-head attention layer.
        dff (int): The dimensionality of the feed-forward layer.
        dropout_rate (float): The dropout rate.
        activation (str): The activation function of the feed-forward layers.

    Returns:
        A TensorFlow Keras model.
    """
    inputs = [
        tf.keras.layers.Input(shape=encoder_input_shape, dtype=tf.float32),
        tf.keras.layers.Input(shape=decoder_input_shape, dtype=tf.int64),
    ]

    encoder_input, decoder_input = inputs

    # Embed the spectrogram, encode it, then decode against the target tokens
    speech_embedding = SpeechFeatureEmbedding(d_model=d_model)(encoder_input)
    encoder = Encoder(num_layers=num_layers_encoder, d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate, activation=activation)(speech_embedding)
    decoder = Decoder(num_layers=num_layers_decoder, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate, activation=activation)(decoder_input, encoder)

    output = tf.keras.layers.Dense(target_vocab_size, dtype=tf.float32)(decoder)

    return tf.keras.Model(inputs=inputs, outputs=output)


# Quick manual check of the positional embedding over speech features:
# import numpy as np
#
# d_model = 512
# speech_embedding = SpeechFeatureEmbedding(d_model=d_model)
# pos_embedding = PositionalEmbedding(vocab_size=0, d_model=d_model, embedding=speech_embedding)
#
# random_input = np.random.randn(1, 1392, 193)
# output = pos_embedding(random_input)
```
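
A minimal sketch of wiring ModelConfigs into SpeechTransformer (not in the commit; the vocabulary size and decoder input length below are illustrative assumptions, not values from the repo):

```python
configs = ModelConfigs()

model = SpeechTransformer(
    target_vocab_size=29,                         # assumed character vocabulary
    encoder_input_shape=configs.input_shape,      # (2048, 193) spectrogram
    decoder_input_shape=(200,),                   # assumed max label length
    num_layers_encoder=configs.num_layers_encoder,
    num_layers_decoder=configs.num_layers_decoder,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    dropout_rate=configs.dropout_rate,
)
model.summary()
```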

Tutorials/10_speech_transformer/test.py

Whitespace-only changes.
