
Commit edee08e

Basic TensorFlow transformer stuff
1 parent 0d4605b commit edee08e

16 files changed, 2472 additions and 840 deletions

16 files changed

+2472
-840
lines changed

Tutorials/09_translation_transformer/configs.py

Lines changed: 13 additions & 4 deletions
@@ -7,11 +7,20 @@
 class ModelConfigs(BaseModelConfigs):
     def __init__(self):
         super().__init__()
-        self.model_path = os.path.join("Models/09_translation_transformer", datetime.strftime(datetime.now(), "%Y%m%d%H%M"))
+        self.model_path = os.path.join(
+            "Models/09_translation_transformer",
+            datetime.strftime(datetime.now(), "%Y%m%d%H%M"),
+        )
         self.num_layers = 4
         self.d_model = 128
         self.num_heads = 8
-        self.dff = 512
+        self.dff = 128
         self.dropout_rate = 0.1
-        self.batch_size = 16
-        self.train_epochs = 100
+        self.batch_size = 32
+        self.train_epochs = 20
+        # CustomSchedule parameters
+        self.init_lr = 0.00001
+        self.lr_after_warmup = 0.0005
+        self.final_lr = 0.0001
+        self.warmup_epochs = 2
+        self.decay_epochs = 9
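
The new init_lr, lr_after_warmup, final_lr, warmup_epochs and decay_epochs fields parameterize the CustomSchedule that the training script imports from mltu.tensorflow.schedules. That implementation is not part of this commit; the snippet below is only a minimal sketch of such a schedule (linear warmup, then linear decay), with the class name WarmupDecaySketch chosen here purely for illustration:

import tensorflow as tf

class WarmupDecaySketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    # Sketch only: the packaged CustomSchedule may be implemented differently.
    def __init__(self, steps_per_epoch, init_lr=0.00001, lr_after_warmup=0.0005,
                 final_lr=0.0001, warmup_epochs=2, decay_epochs=9):
        super().__init__()
        self.steps_per_epoch = steps_per_epoch
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs

    def __call__(self, step):
        epoch = tf.cast(step, tf.float32) / self.steps_per_epoch
        # Ramp linearly from init_lr to lr_after_warmup during the warmup epochs
        warmup_lr = self.init_lr + (self.lr_after_warmup - self.init_lr) * (epoch / self.warmup_epochs)
        # Afterwards decay linearly towards final_lr, never going below it
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup - (epoch - self.warmup_epochs) * (self.lr_after_warmup - self.final_lr) / self.decay_epochs,
        )
        return tf.math.minimum(warmup_lr, decay_lr)

With the values above, the rate would climb from 1e-5 to 5e-4 over the first two epochs and then drift down to 1e-4 over roughly nine more.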
Lines changed: 40 additions & 15 deletions
@@ -1,20 +1,45 @@
 import tensorflow as tf
-from keras import layers
-from transformer import TransformerLayer
 
-def Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size,
-                dropout_rate=0.1, encoder_input_size=None, decoder_input_size=None):
-    inputs = [
-        layers.Input(shape=(encoder_input_size,), dtype=tf.int64),
-        layers.Input(shape=(decoder_input_size,), dtype=tf.int64),
-    ]
+from mltu.tensorflow.transformer.layers import Encoder, Decoder
+
+def Transformer(
+    input_vocab_size: int,
+    target_vocab_size: int,
+    encoder_input_size: int = None,
+    decoder_input_size: int = None,
+    num_layers: int=6,
+    d_model: int=512,
+    num_heads: int=8,
+    dff: int=2048,
+    dropout_rate: float=0.1,
+) -> tf.keras.Model:
+    """
+    A custom TensorFlow model that implements the Transformer architecture.
+
+    Args:
+        input_vocab_size (int): The size of the input vocabulary.
+        target_vocab_size (int): The size of the target vocabulary.
+        encoder_input_size (int): The size of the encoder input sequence.
+        decoder_input_size (int): The size of the decoder input sequence.
+        num_layers (int): The number of layers in the encoder and decoder.
+        d_model (int): The dimensionality of the model.
+        num_heads (int): The number of heads in the multi-head attention layer.
+        dff (int): The dimensionality of the feed-forward layer.
+        dropout_rate (float): The dropout rate.
 
-    transformer = TransformerLayer(num_layers=num_layers, d_model=d_model,
-                                   num_heads=num_heads, dff=dff,
-                                   input_vocab_size=input_vocab_size,
-                                   target_vocab_size=target_vocab_size,
-                                   dropout_rate=dropout_rate)(inputs)
+    Returns:
+        A TensorFlow Keras model.
+    """
+    inputs = [
+        tf.keras.layers.Input(shape=(encoder_input_size,), dtype=tf.int64),
+        tf.keras.layers.Input(shape=(decoder_input_size,), dtype=tf.int64)
+    ]
 
-    outputs = layers.Dense(target_vocab_size)(transformer)
+    encoder_input, decoder_input = inputs
+
+    encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=input_vocab_size, dropout_rate=dropout_rate)(encoder_input)
+    decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate)(decoder_input, encoder)
+
+    output = tf.keras.layers.Dense(target_vocab_size)(decoder)
 
-    return tf.keras.Model(inputs=inputs, outputs=outputs)
+    return tf.keras.Model(inputs=inputs, outputs=output)
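
With this builder, a quick shape check confirms the wiring of the separate Encoder and Decoder. This assumes mltu is installed so the import at the top of the file resolves; the sizes below are arbitrary toy values:

import numpy as np
from model import Transformer

model = Transformer(
    input_vocab_size=100,
    target_vocab_size=120,
    encoder_input_size=32,
    decoder_input_size=32,
    num_layers=2,
    d_model=64,
    num_heads=4,
    dff=128,
)

encoder_tokens = np.random.randint(1, 100, size=(8, 32)).astype("int64")  # (batch, source length)
decoder_tokens = np.random.randint(1, 120, size=(8, 32)).astype("int64")  # (batch, target length)
logits = model.predict([encoder_tokens, decoder_tokens])
print(logits.shape)  # expected (8, 32, 120): per-position logits over the target vocabulary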

Tutorials/09_translation_transformer/pt_to_en_translator.py

Lines changed: 33 additions & 116 deletions
@@ -1,6 +1,5 @@
 import numpy as np
 
-import tensorflow_datasets as tfds
 import tensorflow as tf
 try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
 except: pass
@@ -11,12 +10,16 @@
 from mltu.tensorflow.dataProvider import DataProvider
 from mltu.tokenizers import CustomTokenizer
 
-# from transformer import Transformer, TransformerLayer
+from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss
+from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback
+from mltu.tensorflow.schedules import CustomSchedule
+
 from model import Transformer
 from configs import ModelConfigs
 
 configs = ModelConfigs()
 
+# Path to dataset
 en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
 en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
 es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
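
read_files, referenced in the next hunk's context line, is untouched by this commit and not shown in the diff. A minimal sketch, assuming each opus.en-es file stores one sentence per line in UTF-8:

def read_files(path):
    # Hypothetical helper: read all lines, drop the trailing empty entry
    with open(path, "r", encoding="utf-8") as f:
        data = f.read().split("\n")[:-1]
    return data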
@@ -32,46 +35,22 @@ def read_files(path):
 es_training_data = read_files(es_training_data_path)
 es_validation_data = read_files(es_validation_data_path)
 
+# Consider only sentences with length <= 500
 max_lenght = 500
 train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
 val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
 es_training_data, en_training_data = zip(*train_dataset)
 es_validation_data, en_validation_data = zip(*val_dataset)
 
 # prepare portuguese tokenizer, this is the input language
-tokenizer = CustomTokenizer()
+tokenizer = CustomTokenizer(char_level=True)
 tokenizer.fit_on_texts(es_training_data)
-tokenizer.update(es_validation_data)
+tokenizer.save(configs.model_path + "/tokenizer.json")
 
 # prepare english tokenizer, this is the output language
-detokenizer = CustomTokenizer()
+detokenizer = CustomTokenizer(char_level=True)
 detokenizer.fit_on_texts(en_training_data)
-detokenizer.update(en_validation_data)
-
-
-# examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
-
-# train_examples, val_examples = examples['train'], examples['validation']
-
-# train_dataset = []
-# for pt, en in train_examples:
-#     train_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])
-
-# val_dataset = []
-# for pt, en in val_examples:
-#     val_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])
-
-# # prepare portuguese tokenizer
-# tokenizer = CustomTokenizer()
-# tokenizer.fit_on_texts([train_dataset[i][0] for i in range(len(train_dataset))])
-# tokenizer.update([val_dataset[i][0] for i in range(len(val_dataset))])
-# tokenizer.save(configs.model_path + "/pt_tokenizer.json")
-
-# # prepare english tokenizer
-# detokenizer = CustomTokenizer()
-# detokenizer.fit_on_texts([train_dataset[i][1] for i in range(len(train_dataset))])
-# detokenizer.update([val_dataset[i][1] for i in range(len(val_dataset))])
-# detokenizer.save(configs.model_path + "/eng_tokenizer.json")
+detokenizer.save(configs.model_path + "/detokenizer.json")
 
 
 def preprocess_inputs(data_batch, label_batch):
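
The body of preprocess_inputs is unchanged and not shown here; what matters for the providers below is that it returns ((encoder_input, decoder_input), decoder_output) with the usual teacher-forcing shift between decoder input and decoder target. A toy illustration with made-up token ids:

import numpy as np

# Hypothetical ids for a target sentence "[start] h o l a [end]" (0 would be padding)
label_tokens = np.array([1, 7, 4, 9, 3, 2])

decoder_input  = label_tokens[:-1]  # [1, 7, 4, 9, 3] -> fed to the decoder
decoder_output = label_tokens[1:]   # [7, 4, 9, 3, 2] -> what the model is trained to predict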
@@ -89,23 +68,23 @@ def preprocess_inputs(data_batch, label_batch):
 
     return (encoder_input, decoder_input), decoder_output
 
+# Create Training Data Provider
 train_dataProvider = DataProvider(
     train_dataset,
     batch_size=configs.batch_size,
-    shuffle=True,
-    batch_postprocessors=[preprocess_inputs]
+    batch_postprocessors=[preprocess_inputs],
+    use_cache=True
     )
 
-# for data in train_dataProvider:
-#     pass
-
+# Create Validation Data Provider
 val_dataProvider = DataProvider(
     val_dataset,
     batch_size=configs.batch_size,
-    shuffle=True,
-    batch_postprocessors=[preprocess_inputs]
+    batch_postprocessors=[preprocess_inputs],
+    use_cache=True
     )
 
+# Create TensorFlow Transformer Model
 transformer = Transformer(
     num_layers=configs.num_layers,
     d_model=configs.d_model,
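
Because preprocess_inputs runs as a batch postprocessor, indexing a provider already yields model-ready batches (the removed debug code further down indexed train_dataProvider[0] the same way). A quick sanity check:

# One batch: ((encoder tokens, shifted decoder tokens), decoder targets)
(encoder_input, decoder_input), decoder_output = train_dataProvider[0]
print(encoder_input.shape, decoder_input.shape, decoder_output.shape)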
@@ -120,97 +99,35 @@ def preprocess_inputs(data_batch, label_batch):
 
 transformer.summary()
 
-# transformer(train_dataProvider[0][0], training=False)
-# transformer.load_weights("test/model.h5")
-
-# test = transformer(data[0], training=False)
-# transformer.summary()
-
-
-class MaskedLoss(tf.keras.losses.Loss):
-    def __init__(self, mask_value=0, reduction='none') -> None:
-        super(MaskedLoss, self).__init__()
-        self.mask_value = mask_value
-        self.reduction = reduction
-        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction)
-
-    def __call__(self, y_true, y_pred, sample_weight=None):
-        mask = y_true != self.mask_value
-        loss = self.loss_object(y_true, y_pred)
-
-        mask = tf.cast(mask, dtype=loss.dtype)
-        loss *= mask
-
-        loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
-        return loss
-
-def masked_accuracy(y_true, y_pred):
-    pred = tf.argmax(y_pred, axis=2)
-    label = tf.cast(y_true, pred.dtype)
-    match = label == pred
-
-    mask = label != 0
+# Define learning rate schedule
+learning_rate = CustomSchedule(
+    steps_per_epoch=len(train_dataProvider),
+    init_lr=configs.init_lr,
+    lr_after_warmup=configs.lr_after_warmup,
+    final_lr=configs.final_lr,
+    warmup_epochs=configs.warmup_epochs,
+    decay_epochs=configs.decay_epochs,
+)
 
-    match = match & mask
-
-    match = tf.cast(match, dtype=tf.float32)
-    mask = tf.cast(mask, dtype=tf.float32)
-    return tf.reduce_sum(match) / tf.reduce_sum(mask)
-
-# vocabulary = tf.constant(eng_tokenizer.list())
-# vocabulary = tf.constant(list(self.vocab))
-# wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy()
-
-# @tf.function
-# def wer(y_true, y_pred):
-#     pred = tf.argmax(y_pred, axis=2)
-#     label = tf.cast(y_true, pred.dtype)
-
-#     wer = WERMetric.get_wer(pred, label, vocabulary, padding=0, separator=" ")
-
-#     # pred_str = pt_tokenizer.detokenize(pred.numpy())
-#     # label_str = eng_tokenizer.detokenize(label.numpy())
-#     # wer = get_wer(pred_str, label_str)
-
-#     return wer
-
-class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
-    def __init__(self, d_model, warmup_steps=4000):
-        super().__init__()
-
-        self.d_model = d_model
-        self.warmup_steps = warmup_steps
-
-    def get_config(self):
-        return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}
-
-    def __call__(self, step):
-        step = tf.cast(step, dtype=tf.float32)
-        arg1 = tf.math.rsqrt(step)
-        arg2 = step * (self.warmup_steps ** -1.5)
-
-        return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)
-
-learning_rate = CustomSchedule(configs.d_model)
 optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
 
-
+# Compile the model
 transformer.compile(
     loss=MaskedLoss(),
     optimizer=optimizer,
-    metrics=[masked_accuracy],
+    metrics=[MaskedAccuracy()],
     run_eagerly=False
     )
 
-
 # Define callbacks
-earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=10, verbose=1, mode="max")
+earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max")
 checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
 tb_callback = TensorBoard(f"{configs.model_path}/logs")
-reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="max")
-model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=True)
-
+reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max")
+model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
+encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})
 
+# Train the model
 transformer.fit(
     train_dataProvider,
     validation_data=val_dataProvider,
Tutorials/09_translation_transformer/test_metadata.py

Lines changed: 0 additions & 29 deletions
This file was deleted.
