import tensorflow as tf

from mltu.tensorflow.transformer.layers import EncoderLayer, Decoder, PositionalEmbedding

class SpeechFeatureEmbedding(tf.keras.layers.Layer):
    """
    Embeds audio features (e.g. spectrogram frames) into d_model-dimensional
    vectors, downsampling the time axis by a factor of 8 with three stride-2
    convolutions.
    """
    def __init__(self, d_model: int = 64):
        super().__init__()
        # Each Conv1D halves the sequence length (stride 2, "same" padding)
        self.conv1 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.conv2 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.conv3 = tf.keras.layers.Conv1D(d_model, kernel_size=3, strides=2, padding="same", use_bias=False)
        self.bn = tf.keras.layers.BatchNormalization()
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.gelu1 = tf.keras.layers.Activation("gelu")
        self.gelu2 = tf.keras.layers.Activation("gelu")
        self.gelu3 = tf.keras.layers.Activation("gelu")
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, x):
        # Three Conv1D -> BatchNorm -> GELU blocks, followed by dropout
        x = self.conv1(x)
        x = self.bn(x)
        x = self.gelu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.gelu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.gelu3(x)
        x = self.dropout(x)
        return x
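
# A minimal shape check for SpeechFeatureEmbedding. The input shape is an
# illustrative assumption, mirroring the (batch, time, features) spectrogram
# shape used in the commented test at the bottom of this file:
#
#   embedding = SpeechFeatureEmbedding(d_model=64)
#   dummy = tf.random.normal((1, 1392, 193))
#   print(embedding(dummy).shape)  # (1, 174, 64) -- 1392 / 2 / 2 / 2 = 174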


class Encoder(tf.keras.layers.Layer):
    """
    A custom TensorFlow layer that implements the Transformer Encoder. This layer
    is most often used in Transformer models for natural language processing tasks
    such as machine translation, text summarization or text classification; here
    it encodes embedded speech features instead of text tokens.

    Methods:
        call: Performs the forward pass of the layer.

    Attributes:
        d_model (int): The dimensionality of the model.
        num_layers (int): The number of layers in the encoder.
        pos_embedding (PositionalEmbedding): The positional embedding layer.
        enc_layers (list): The list of encoder layers.
        dropout (tf.keras.layers.Dropout): The dropout layer.
    """
    def __init__(self, num_layers: int, d_model: int, num_heads: int, dff: int, dropout_rate: float = 0.1, activation: str = "relu"):
        """
        Constructor of the Encoder.

        Args:
            num_layers (int): The number of layers in the encoder.
            d_model (int): The dimensionality of the model.
            num_heads (int): The number of heads in the multi-head attention layer.
            dff (int): The dimensionality of the feed-forward layer.
            dropout_rate (float): The dropout rate.
            activation (str): The activation function of the feed-forward network.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # The input is already embedded by SpeechFeatureEmbedding, so the
        # positional embedding is built without a token vocabulary.
        self.pos_embedding = PositionalEmbedding(vocab_size=None, d_model=d_model)

        self.enc_layers = [
            EncoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                dff=dff,
                dropout_rate=dropout_rate,
                activation=activation,
            )
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """
        The call function that performs the forward pass of the layer.

        Args:
            x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model).

        Returns:
            tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model).
        """
        # Add positional information; x keeps shape (batch_size, seq_len, d_model)
        x = self.pos_embedding(x)

        # Add dropout.
        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # Shape `(batch_size, seq_len, d_model)`.
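
# A minimal sketch of running the Encoder on pre-embedded features (the sizes
# below are illustrative assumptions, not values from the training setup):
#
#   encoder = Encoder(num_layers=2, d_model=64, num_heads=4, dff=256)
#   features = tf.random.normal((1, 174, 64))   # (batch, seq_len, d_model)
#   print(encoder(features).shape)              # (1, 174, 64)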


def SpeechTransformer(
    target_vocab_size: int,
    encoder_input_shape: tuple = None,
    decoder_input_shape: tuple = None,
    num_layers_encoder: int = 6,
    num_layers_decoder: int = 6,
    d_model: int = 512,
    num_heads: int = 8,
    dff: int = 2048,
    dropout_rate: float = 0.1,
    activation: str = "relu",
) -> tf.keras.Model:
149+ """
150+ A custom TensorFlow model that implements the Transformer architecture.
151+
152+ Args:
153+ target_vocab_size (int): The size of the target vocabulary.
154+ encoder_input_size (int): The size of the encoder input sequence.
155+ decoder_input_size (int): The size of the decoder input sequence.
156+ num_layers (int): The number of layers in the encoder and decoder.
157+ d_model (int): The dimensionality of the model.
158+ num_heads (int): The number of heads in the multi-head attention layer.
159+ dff (int): The dimensionality of the feed-forward layer.
160+ dropout_rate (float): The dropout rate.
161+
162+ Returns:
163+ A TensorFlow Keras model.
164+ """
    inputs = [
        tf.keras.layers.Input(shape=encoder_input_shape, dtype=tf.float32),
        tf.keras.layers.Input(shape=decoder_input_shape, dtype=tf.int64),
    ]

    encoder_input, decoder_input = inputs

    # Embed and downsample the audio features, then encode them
    speech_embedding = SpeechFeatureEmbedding(d_model=d_model)(encoder_input)
    encoder_output = Encoder(num_layers=num_layers_encoder, d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate, activation=activation)(speech_embedding)

    # Decode target tokens while attending to the encoded audio
    decoder_output = Decoder(num_layers=num_layers_decoder, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate, activation=activation)(decoder_input, encoder_output)

    # Project decoder outputs to vocabulary logits
    output = tf.keras.layers.Dense(target_vocab_size, dtype=tf.float32)(decoder_output)

    return tf.keras.Model(inputs=inputs, outputs=output)
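
# A minimal construction sketch (all sizes are illustrative assumptions; the
# (1392, 193) spectrogram shape mirrors the commented test below):
#
#   model = SpeechTransformer(
#       target_vocab_size=1000,
#       encoder_input_shape=(1392, 193),   # (time steps, spectrogram features)
#       decoder_input_shape=(None,),       # variable-length target token ids
#       num_layers_encoder=2,
#       num_layers_decoder=2,
#       d_model=256,
#       num_heads=4,
#       dff=512,
#   )
#   model.summary()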


# Smoke test for the embedding layers, kept for reference:
#
# import numpy as np
#
# d_model = 512
#
# speech_embedding = SpeechFeatureEmbedding(d_model=d_model)
# pos_embedding = PositionalEmbedding(vocab_size=0, d_model=d_model, embedding=speech_embedding)
#
# random_input = np.random.randn(1, 1392, 193)  # (batch, time, features)
#
# output = pos_embedding(random_input)