import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

# Let TensorFlow grow GPU memory on demand instead of reserving it all upfront
try:
    [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except:
    pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.tensorflow.callbacks import Model2onnx

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

# from transformer import Transformer, TransformerLayer
from model import Transformer
from configs import ModelConfigs

configs = ModelConfigs()

en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es"

def read_files(path):
    # Each corpus file holds one sentence per line; drop the trailing empty entry left by the final newline
    with open(path, "r", encoding="utf-8") as f:
        dataset = f.read().split("\n")[:-1]
    return dataset

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

# Keep only sentence pairs where both sides are at most max_length characters long
max_length = 500
train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

# Prepare the Spanish tokenizer; Spanish is the input (source) language
tokenizer = CustomTokenizer()
tokenizer.fit_on_texts(es_training_data)
tokenizer.update(es_validation_data)

# Prepare the English tokenizer; English is the output (target) language
detokenizer = CustomTokenizer()
detokenizer.fit_on_texts(en_training_data)
detokenizer.update(en_validation_data)
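
# Both the training and validation sentences are fed to each tokenizer, so the
# resulting vocabulary and max_length (used below for the model's input sizes
# and vocab sizes) should cover every sentence the model will see.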


# examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)

# train_examples, val_examples = examples['train'], examples['validation']

# train_dataset = []
# for pt, en in train_examples:
#     train_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])

# val_dataset = []
# for pt, en in val_examples:
#     val_dataset.append([pt.numpy().decode('utf-8'), en.numpy().decode('utf-8')])

# # prepare portuguese tokenizer
# tokenizer = CustomTokenizer()
# tokenizer.fit_on_texts([train_dataset[i][0] for i in range(len(train_dataset))])
# tokenizer.update([val_dataset[i][0] for i in range(len(val_dataset))])
# tokenizer.save(configs.model_path + "/pt_tokenizer.json")

# # prepare english tokenizer
# detokenizer = CustomTokenizer()
# detokenizer.fit_on_texts([train_dataset[i][1] for i in range(len(train_dataset))])
# detokenizer.update([val_dataset[i][1] for i in range(len(val_dataset))])
# detokenizer.save(configs.model_path + "/eng_tokenizer.json")


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label)-1] = label[:-1]  # Drop the [END] token
        decoder_output[index][:len(label)-1] = label[1:]  # Drop the [START] token

    return (encoder_input, decoder_input), decoder_output
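
# Teacher-forcing layout produced above, illustrated with a hypothetical tokenized
# label [START, w1, w2, END] (token ids made up for illustration):
#   decoder_input  = [START, w1, w2, 0, ...]   # [END] dropped
#   decoder_output = [w1, w2, END, 0, ...]     # [START] dropped
# Remaining positions stay 0, which the masked loss and accuracy below ignore.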

train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    shuffle=True,
    batch_postprocessors=[preprocess_inputs]
)

# for data in train_dataProvider:
#     pass

val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    shuffle=True,
    batch_postprocessors=[preprocess_inputs]
)

transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer)+1,
    target_vocab_size=len(detokenizer)+1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length
)
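
# The encoder/decoder input sizes match tokenizer.max_length / detokenizer.max_length,
# so the fixed-size arrays built in preprocess_inputs line up with the model's expected
# shapes; the +1 on the vocab sizes presumably leaves room for the padding id 0.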

transformer.summary()

# transformer(train_dataProvider[0][0], training=False)
# transformer.load_weights("test/model.h5")

# test = transformer(data[0], training=False)
# transformer.summary()


class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self, mask_value=0, reduction="none") -> None:
        super(MaskedLoss, self).__init__()
        self.mask_value = mask_value
        self.reduction = reduction
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction)

    def __call__(self, y_true, y_pred, sample_weight=None):
        mask = y_true != self.mask_value
        loss = self.loss_object(y_true, y_pred)

        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask

        loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
        return loss
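
# Padding positions (mask_value, i.e. token id 0) are excluded from the loss: the
# per-token cross-entropy is zeroed where y_true equals the mask value and the sum is
# divided by the number of real tokens, so padding never contributes to the gradient.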

def masked_accuracy(y_true, y_pred):
    pred = tf.argmax(y_pred, axis=2)
    label = tf.cast(y_true, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match) / tf.reduce_sum(mask)
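
# Hypothetical example: labels [5, 7, 0, 0] and argmax predictions [5, 2, 9, 0]
# match on one of the two non-padding positions, so masked_accuracy = 0.5;
# the padding positions are ignored entirely.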

# vocabulary = tf.constant(eng_tokenizer.list())
# vocabulary = tf.constant(list(self.vocab))
# wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy()

# @tf.function
# def wer(y_true, y_pred):
#     pred = tf.argmax(y_pred, axis=2)
#     label = tf.cast(y_true, pred.dtype)

#     wer = WERMetric.get_wer(pred, label, vocabulary, padding=0, separator=" ")

#     # pred_str = pt_tokenizer.detokenize(pred.numpy())
#     # label_str = eng_tokenizer.detokenize(label.numpy())
#     # wer = get_wer(pred_str, label_str)

#     return wer

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def get_config(self):
        return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)
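
# The schedule above implements the warmup rule from "Attention Is All You Need"
# (Vaswani et al., 2017): lr = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
# i.e. a linear ramp for the first warmup_steps steps followed by inverse-sqrt decay.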

learning_rate = CustomSchedule(configs.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
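
# Adam with beta_2=0.98 and epsilon=1e-9 matches the optimizer settings used with this
# learning-rate schedule in the original Transformer paper.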


transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[masked_accuracy],
    run_eagerly=False
)


# Define callbacks
earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=10, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="max")
model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=True)


transformer.fit(
    train_dataProvider,
    validation_data=val_dataProvider,
    epochs=configs.train_epochs,
    callbacks=[
        checkpoint,
        tb_callback,
        reduceLROnPlat,
        model2onnx
    ]
)