import numpy as np

- import tensorflow_datasets as tfds
import tensorflow as tf
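+ # Allow dynamic GPU memory growth so TensorFlow does not reserve all GPU memory up front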
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass
from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

- # from transformer import Transformer, TransformerLayer
+ from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss
+ from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback
+ from mltu.tensorflow.schedules import CustomSchedule
+
from model import Transformer
from configs import ModelConfigs

configs = ModelConfigs()

+ # Paths to the dataset files
en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
@@ -32,46 +35,22 @@ def read_files(path):
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

+ # Consider only sentences with length <= 500
max_lenght = 500
train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

# prepare Spanish tokenizer, this is the input language
- tokenizer = CustomTokenizer()
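+ # char_level=True builds the vocabulary from individual characters rather than whole words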
+ tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
- tokenizer.update(es_validation_data)
+ tokenizer.save(configs.model_path + "/tokenizer.json")

# prepare English tokenizer, this is the output language
- detokenizer = CustomTokenizer()
+ detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
- detokenizer.update(en_validation_data)
-
-
- # examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
-
- # train_examples, val_examples = examples['train'], examples['validation']
-
- # train_dataset = []
- # for pt, en in train_examples:
- #     train_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])
-
- # val_dataset = []
- # for pt, en in val_examples:
- #     val_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])
-
- # # prepare portuguese tokenizer
- # tokenizer = CustomTokenizer()
- # tokenizer.fit_on_texts([train_dataset[i][0] for i in range(len(train_dataset))])
- # tokenizer.update([val_dataset[i][0] for i in range(len(val_dataset))])
- # tokenizer.save(configs.model_path + "/pt_tokenizer.json")
-
- # # prepare english tokenizer
- # detokenizer = CustomTokenizer()
- # detokenizer.fit_on_texts([train_dataset[i][1] for i in range(len(train_dataset))])
- # detokenizer.update([val_dataset[i][1] for i in range(len(val_dataset))])
- # detokenizer.save(configs.model_path + "/eng_tokenizer.json")
+ detokenizer.save(configs.model_path + "/detokenizer.json")


def preprocess_inputs(data_batch, label_batch):
@@ -89,23 +68,23 @@ def preprocess_inputs(data_batch, label_batch):

    return (encoder_input, decoder_input), decoder_output

+ # Create Training Data Provider
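+ # (use_cache=True caches the preprocessed batches in memory so later epochs reuse them instead of rebuilding them)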
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
-     shuffle=True,
-     batch_postprocessors=[preprocess_inputs]
+     batch_postprocessors=[preprocess_inputs],
+     use_cache=True
    )

- # for data in train_dataProvider:
- #     pass
-
+ # Create Validation Data Provider
val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
-     shuffle=True,
-     batch_postprocessors=[preprocess_inputs]
+     batch_postprocessors=[preprocess_inputs],
+     use_cache=True
    )

+ # Create TensorFlow Transformer Model
transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
@@ -120,97 +99,35 @@ def preprocess_inputs(data_batch, label_batch):

transformer.summary()

- # transformer(train_dataProvider[0][0], training=False)
- # transformer.load_weights("test/model.h5")
-
- # test = transformer(data[0], training=False)
- # transformer.summary()
-
-
- class MaskedLoss(tf.keras.losses.Loss):
-     def __init__(self, mask_value=0, reduction='none') -> None:
-         super(MaskedLoss, self).__init__()
-         self.mask_value = mask_value
-         self.reduction = reduction
-         self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction)
-
-     def __call__(self, y_true, y_pred, sample_weight=None):
-         mask = y_true != self.mask_value
-         loss = self.loss_object(y_true, y_pred)
-
-         mask = tf.cast(mask, dtype=loss.dtype)
-         loss *= mask
-
-         loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
-         return loss
-
- def masked_accuracy(y_true, y_pred):
-     pred = tf.argmax(y_pred, axis=2)
-     label = tf.cast(y_true, pred.dtype)
-     match = label == pred
-
-     mask = label != 0
+ # Define learning rate schedule
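+ # (ramps the learning rate from init_lr up to lr_after_warmup over warmup_epochs, then decays it towards final_lr over decay_epochs)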
+ learning_rate = CustomSchedule(
+     steps_per_epoch=len(train_dataProvider),
+     init_lr=configs.init_lr,
+     lr_after_warmup=configs.lr_after_warmup,
+     final_lr=configs.final_lr,
+     warmup_epochs=configs.warmup_epochs,
+     decay_epochs=configs.decay_epochs,
+ )

-     match = match & mask
-
-     match = tf.cast(match, dtype=tf.float32)
-     mask = tf.cast(mask, dtype=tf.float32)
-     return tf.reduce_sum(match) / tf.reduce_sum(mask)
-
- # vocabulary = tf.constant(eng_tokenizer.list())
- # vocabulary = tf.constant(list(self.vocab))
- # wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy()
-
- # @tf.function
- # def wer(y_true, y_pred):
- #     pred = tf.argmax(y_pred, axis=2)
- #     label = tf.cast(y_true, pred.dtype)
-
- #     wer = WERMetric.get_wer(pred, label, vocabulary, padding=0, separator=" ")
-
- #     # pred_str = pt_tokenizer.detokenize(pred.numpy())
- #     # label_str = eng_tokenizer.detokenize(label.numpy())
- #     # wer = get_wer(pred_str, label_str)
-
- #     return wer
-
- class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
-     def __init__(self, d_model, warmup_steps=4000):
-         super().__init__()
-
-         self.d_model = d_model
-         self.warmup_steps = warmup_steps
-
-     def get_config(self):
-         return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}
-
-     def __call__(self, step):
-         step = tf.cast(step, dtype=tf.float32)
-         arg1 = tf.math.rsqrt(step)
-         arg2 = step * (self.warmup_steps ** -1.5)
-
-         return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)
-
- learning_rate = CustomSchedule(configs.d_model)
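+ # Adam with beta_1=0.9, beta_2=0.98 and epsilon=1e-9, the optimizer settings from the original Transformer paper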
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

-
+ # Compile the model
transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
-     metrics=[masked_accuracy],
+     metrics=[MaskedAccuracy()],
    run_eagerly=False
    )

-
# Define callbacks
- earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=10, verbose=1, mode="max")
+ earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
- reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="max")
- model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=True)
-
+ reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max")
+ model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
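+ # EncDecSplitCallback splits the trained Transformer into separate encoder and decoder models and saves them with the tokenizer metadata once training finishes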
+ encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})

+ # Train the model
transformer.fit(
    train_dataProvider,
    validation_data=val_dataProvider,