
Commit 54d9b99

Save transformers stuff
1 parent 6a4eb20

File tree

15 files changed: +1793 -12 lines

.gitignore
Lines changed: 3 additions & 1 deletion

@@ -10,4 +10,6 @@ dist
 !*.md

 .idea
-.python-version
+.python-version
+
+test

Tests/test_tensorflow_metrics.py
Lines changed: 53 additions & 0 deletions

import unittest

import numpy as np
import tensorflow as tf

from mltu.tensorflow.metrics import CERMetric, WERMetric


class TestMetrics(unittest.TestCase):

    def to_embeddings(self, sentences, vocab):
        embeddings, max_len = [], 0

        for sentence in sentences:
            embedding = []
            for character in sentence:
                embedding.append(vocab.index(character))
            embeddings.append(embedding)
            max_len = max(max_len, len(embedding))
        return embeddings, max_len

    def setUp(self) -> None:
        true_words = ["Who are you", "I am a student", "I am a teacher", "Just different sentence length"]
        pred_words = ["Who are you", "I am a ztudent", "I am A reacher", "Just different length"]

        vocab = set()
        for sen in true_words + pred_words:
            for character in sen:
                vocab.add(character)
        self.vocab = "".join(vocab)

        sentence_true, max_len_true = self.to_embeddings(true_words, self.vocab)
        sentence_pred, max_len_pred = self.to_embeddings(pred_words, self.vocab)

        max_len = max(max_len_true, max_len_pred)
        padding_length = 64

        self.sen_true = [np.pad(sen, (0, max_len - len(sen)), "constant", constant_values=len(self.vocab)) for sen in sentence_true]
        self.sen_pred = [np.pad(sen, (0, padding_length - len(sen)), "constant", constant_values=-1) for sen in sentence_pred]

    def test_CERMetric(self):
        vocabulary = tf.constant(list(self.vocab))
        cer = CERMetric.get_cer(self.sen_true, self.sen_pred, vocabulary).numpy()

        self.assertTrue(np.array_equal(cer, np.array([0.0, 0.071428575, 0.14285715, 0.42857143], dtype=np.float32)))

    def test_WERMetric(self):
        vocabulary = tf.constant(list(self.vocab))
        wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy()

        self.assertTrue(np.array_equal(wer, np.array([0., 0.25, 0.5, 0.33333334], dtype=np.float32)))


if __name__ == "__main__":
    unittest.main()
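For reference (not part of the commit), the expected values above can be reproduced with a plain-Python edit distance; the normalization appears to be by the length of the second sequence passed to get_cer/get_wer (the prediction here), which is what makes the last pair come out as 9/21 ≈ 0.4286 and 1/3 ≈ 0.3333.

# Illustrative sanity check only, independent of mltu/TensorFlow.
def edit_distance(a, b):
    # Classic dynamic-programming Levenshtein distance.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = curr
    return prev[-1]

pairs = [("Who are you", "Who are you"),
         ("I am a student", "I am a ztudent"),
         ("I am a teacher", "I am A reacher"),
         ("Just different sentence length", "Just different length")]

print([edit_distance(t, p) / len(p) for t, p in pairs])                          # CER: 0.0, 0.0714, 0.1428, 0.4286
print([edit_distance(t.split(), p.split()) / len(p.split()) for t, p in pairs])  # WER: 0.0, 0.25, 0.5, 0.3333
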
Lines changed: 17 additions & 0 deletions

import os
from datetime import datetime

from mltu.configs import BaseModelConfigs


class ModelConfigs(BaseModelConfigs):
    def __init__(self):
        super().__init__()
        self.model_path = os.path.join("Models/09_translation_transformer", datetime.strftime(datetime.now(), "%Y%m%d%H%M"))
        self.num_layers = 4
        self.d_model = 128
        self.num_heads = 8
        self.dff = 512
        self.dropout_rate = 0.1
        self.batch_size = 16
        self.train_epochs = 100
Lines changed: 46 additions & 0 deletions

import os

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# URL to the directory containing the files to be downloaded
language = "en-es"
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
save_directory = f"./Datasets/{language}"

# Create the save directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML response
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the anchor tags in the HTML
links = soup.find_all('a')

# Extract the href attribute from each anchor tag
file_links = [link['href'] for link in links if '.' in link['href']]

# Download each file
for file_link in tqdm(file_links):
    file_url = url + file_link
    save_path = os.path.join(save_directory, file_link)

    print(f"Downloading {file_url}")

    # Send a GET request for the file
    file_response = requests.get(file_url)
    if file_response.status_code == 404:
        print(f"Could not download {file_url}")
        continue

    # Save the file to the specified directory
    with open(save_path, 'wb') as file:
        file.write(file_response.content)

    print(f"Saved {file_link}")

print("All files have been downloaded.")
Lines changed: 20 additions & 0 deletions

import tensorflow as tf
from keras import layers

from transformer import TransformerLayer


def Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size,
                dropout_rate=0.1, encoder_input_size=None, decoder_input_size=None):
    inputs = [
        layers.Input(shape=(encoder_input_size,), dtype=tf.int64),
        layers.Input(shape=(decoder_input_size,), dtype=tf.int64),
    ]

    transformer = TransformerLayer(num_layers=num_layers, d_model=d_model,
                                   num_heads=num_heads, dff=dff,
                                   input_vocab_size=input_vocab_size,
                                   target_vocab_size=target_vocab_size,
                                   dropout_rate=dropout_rate)(inputs)

    outputs = layers.Dense(target_vocab_size)(transformer)

    return tf.keras.Model(inputs=inputs, outputs=outputs)
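A minimal usage sketch (not part of the commit): the numbers below are placeholders; the real values come from ModelConfigs and the fitted tokenizers in the training script later in this commit. The Dense head returns raw logits, which is why the training loss below is built with from_logits=True.

# Illustrative only; vocabulary sizes and sequence lengths are made up here.
model = Transformer(
    num_layers=4, d_model=128, num_heads=8, dff=512,
    input_vocab_size=15000, target_vocab_size=12000,
    dropout_rate=0.1, encoder_input_size=96, decoder_input_size=96,
)
model.summary()  # two int64 inputs (encoder tokens, decoder tokens) -> (batch, 96, 12000) logits
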
Lines changed: 224 additions & 0 deletions

import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.tensorflow.callbacks import Model2onnx

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

# from transformer import Transformer, TransformerLayer
from model import Transformer
from configs import ModelConfigs

configs = ModelConfigs()

en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es"

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")[:-1]
    return lines

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

max_length = 500
train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

# prepare Spanish tokenizer, this is the input language
tokenizer = CustomTokenizer()
tokenizer.fit_on_texts(es_training_data)
tokenizer.update(es_validation_data)

# prepare English tokenizer, this is the output language
detokenizer = CustomTokenizer()
detokenizer.fit_on_texts(en_training_data)
detokenizer.update(en_validation_data)


# examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
# train_examples, val_examples = examples['train'], examples['validation']

# train_dataset = []
# for pt, en in train_examples:
#     train_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])

# val_dataset = []
# for pt, en in val_examples:
#     val_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])

# # prepare portuguese tokenizer
# tokenizer = CustomTokenizer()
# tokenizer.fit_on_texts([train_dataset[i][0] for i in range(len(train_dataset))])
# tokenizer.update([val_dataset[i][0] for i in range(len(val_dataset))])
# tokenizer.save(configs.model_path + "/pt_tokenizer.json")

# # prepare english tokenizer
# detokenizer = CustomTokenizer()
# detokenizer.fit_on_texts([train_dataset[i][1] for i in range(len(train_dataset))])
# detokenizer.update([val_dataset[i][1] for i in range(len(val_dataset))])
# detokenizer.save(configs.model_path + "/eng_tokenizer.json")


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label)-1] = label[:-1]   # Drop the [END] token
        decoder_output[index][:len(label)-1] = label[1:]   # Drop the [START] token

    return (encoder_input, decoder_input), decoder_output
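To make the shifting above concrete, here is an illustrative example with made-up token ids (assuming the tokenizer wraps each sentence in [START] ... [END] and 0 is the padding id):

# Illustrative only, not part of the commit. Standard teacher forcing:
# the decoder sees the target shifted right and learns to predict the next token.
label          = [1, 17, 42, 99, 2]   # [START], w1, w2, w3, [END]
decoder_input  = label[:-1]           # [1, 17, 42, 99]  -- [END] dropped
decoder_output = label[1:]            # [17, 42, 99, 2]  -- [START] dropped
# Both rows are then zero-padded to detokenizer.max_length; MaskedLoss and
# masked_accuracy below ignore those padded positions.
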
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    shuffle=True,
    batch_postprocessors=[preprocess_inputs]
)

# for data in train_dataProvider:
#     pass

val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    shuffle=True,
    batch_postprocessors=[preprocess_inputs]
)

transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer)+1,
    target_vocab_size=len(detokenizer)+1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length
)

transformer.summary()

# transformer(train_dataProvider[0][0], training=False)
# transformer.load_weights("test/model.h5")

# test = transformer(data[0], training=False)
# transformer.summary()


class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self, mask_value=0, reduction='none') -> None:
        super(MaskedLoss, self).__init__()
        self.mask_value = mask_value
        self.reduction = reduction
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction)

    def __call__(self, y_true, y_pred, sample_weight=None):
        # Exclude padded positions (mask_value) from the loss average
        mask = y_true != self.mask_value
        loss = self.loss_object(y_true, y_pred)

        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask

        loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
        return loss

def masked_accuracy(y_true, y_pred):
    pred = tf.argmax(y_pred, axis=2)
    label = tf.cast(y_true, pred.dtype)
    match = label == pred

    # Ignore padded positions (label 0) when counting matches
    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match) / tf.reduce_sum(mask)
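A quick illustration (not part of the commit) of why the masking matters: padded positions would otherwise inflate the accuracy.

# Illustrative only: one 4-token sequence where the last two positions are padding.
y_true = tf.constant([[5, 7, 0, 0]])
y_pred = tf.one_hot([[5, 9, 0, 0]], depth=10)   # correct, wrong, "correct" pad, "correct" pad
print(masked_accuracy(y_true, y_pred).numpy())  # 0.5 -- unmasked accuracy would be 0.75
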
# vocabulary = tf.constant(eng_tokenizer.list())
# vocabulary = tf.constant(list(self.vocab))
# wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy()

# @tf.function
# def wer(y_true, y_pred):
#     pred = tf.argmax(y_pred, axis=2)
#     label = tf.cast(y_true, pred.dtype)

#     wer = WERMetric.get_wer(pred, label, vocabulary, padding=0, separator=" ")

#     # pred_str = pt_tokenizer.detokenize(pred.numpy())
#     # label_str = eng_tokenizer.detokenize(label.numpy())
#     # wer = get_wer(pred_str, label_str)

#     return wer

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def get_config(self):
        return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)
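This is the warmup schedule from "Attention Is All You Need": lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), a linear ramp over the first 4000 steps followed by inverse-square-root decay. With d_model = 128 it peaks around 128^-0.5 * 4000^-0.5 ≈ 1.4e-3. A quick check (illustrative only, not part of the commit):

schedule = CustomSchedule(d_model=128)
for step in [1, 1000, 4000, 16000]:
    print(step, float(schedule(step)))
# roughly 3.5e-07, 3.5e-04, 1.4e-03 (peak), 7.0e-04
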
learning_rate = CustomSchedule(configs.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[masked_accuracy],
    run_eagerly=False
)


# Define callbacks
earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=10, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="max")
model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=True)


transformer.fit(
    train_dataProvider,
    validation_data=val_dataProvider,
    epochs=configs.train_epochs,
    callbacks=[
        checkpoint,
        tb_callback,
        reduceLROnPlat,
        model2onnx
    ]
)
