@@ -1,11 +1,11 @@
 from torch.utils.data import DataLoader, RandomSampler
-import torch, os, sys, time, argparse, numpy as np
 from utils_dataset import SQLDataset, HDF5Dataset
+import torch, os, time, argparse, numpy as np
 from transformers.optimization import AdamW
 from model_generator import GeneTransformer
-from datetime import datetime, timedelta
-from utils_logplot import LogPlot
 import utils_misc, utils_tokenizer
+from utils_logplot import LogPlot
+from datetime import datetime
 
 from model_coverage import KeywordCoverage
 from model_guardrails import PatternPenalty, LengthPenalty, RepeatPenalty
@@ -17,7 +17,6 @@
 parser.add_argument("--experiment", type=str, required=True, help="Experiment name. Will be used to save a model file and a log file.")
 parser.add_argument("--dataset_file", type=str, required=True, help="Which dataset file to use. Can be a full path, or the root folder will be prepended.")
 
-parser.add_argument("--root_folder", type=str, default="/home/"+user+"/")
 parser.add_argument("--train_batch_size", type=int, default=5, help="Training batch size.")
 parser.add_argument("--n_epochs", type=int, default=3, help="Number of epochs to run over the data.")
 parser.add_argument("--optim_every", type=int, default=4, help="Optimize every x backprops. A multiplier to the true batch size.")
@@ -34,8 +33,8 @@
     os.environ["CUDA_VISIBLE_DEVICES"] = ""+str(freer_gpu)
     args.experiment += "_"+freer_gpu
 
-models_folder = "/home/ubuntu/models/"
-log_folder = "/home/ubuntu/logs/"
+models_folder = "/home/phillab/models/"
+log_folder = "/home/phillab/logs/"
 
 summarizer_model_start = os.path.join(models_folder, "gpt2_copier23.bin")
 
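The hunk above pins the process to the least-loaded GPU (via `utils_misc`) before any CUDA context is created, then tags the experiment name with the chosen device. A rough standalone sketch of that pattern, assuming an `nvidia-smi` free-memory query; the actual helper in `utils_misc` is not shown in this diff:

```python
# Sketch only: pick the GPU with the most free memory and pin the process to it.
# The real helper lives in utils_misc; the nvidia-smi query here is an assumption.
import os
import subprocess

def get_freer_gpu():
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"]
    ).decode()
    free_mem = [int(x) for x in out.strip().split("\n")]
    return max(range(len(free_mem)), key=lambda i: free_mem[i])

freer_gpu = str(get_freer_gpu())
os.environ["CUDA_VISIBLE_DEVICES"] = freer_gpu   # must be set before torch touches CUDA
experiment = "demo_experiment" + "_" + freer_gpu  # mirrors args.experiment += "_"+freer_gpu
```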
@@ -65,6 +64,7 @@ def collate_func(inps):
     else:
         return [inp[0].decode() for inp in inps]
 
+
 param_optimizer = list(summarizer.model.named_parameters())
 no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
 optimizer_grouped_parameters = [
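The lines above begin the standard AdamW weight-decay split: parameters whose names match `no_decay` (biases and LayerNorm weights) go into a group with zero weight decay, everything else gets decayed. The hunk cuts the list off, so the following is a self-contained sketch of the usual pattern; the 0.01 decay value and the learning rate are illustrative, not taken from the repo:

```python
# Sketch of the two-group weight-decay setup for AdamW. The decay value and
# learning rate are placeholders; a tiny Linear stands in for summarizer.model.
import torch
from transformers.optimization import AdamW

model = torch.nn.Linear(4, 4)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},  # ordinary weights: decayed
    {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},   # biases / LayerNorm parameters: exempt
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
```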
@@ -88,9 +88,9 @@ def collate_func(inps):
 
 print("Loading scorers")
 
-coverage_model_file = os.path.join(models_folder, "bert_coverage.bin")
+coverage_model_file = os.path.join(models_folder, "bert_coverage_google_cnndm_length15_1.bin")
 coverage_keyword_model_file = os.path.join(models_folder, "keyword_extractor.joblib")
-fluency_news_model_file = os.path.join(models_folder, "fluency_news_bs32.bin")
+fluency_news_model_file = os.path.join(models_folder, "news_gpt2_bs32.bin")
 
 scorers = [{"name": "coverage", "importance": 10.0, "sign": 1.0, "model": KeywordCoverage(args.device, keyword_model_file=coverage_keyword_model_file, model_file=coverage_model_file)},
            {"name": "fluency", "importance": 2.0, "sign": 1.0, "model": GeneTransformer(max_output_length=args.max_output_length, device=args.device, starter_model=fluency_news_model_file)},
@@ -102,6 +102,7 @@ def collate_func(inps):
 def background_tokenizer(bodies, out_queue):
     out_queue.put([bert_tokenizer.encode(body) for body in bodies])
 
+
 my_queue = queue.Queue()
 print("Started training")
 
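`background_tokenizer` lets BERT tokenization overlap with the summarizer's forward pass: a worker thread tokenizes the batch and drops the result into a `queue.Queue`, which the main loop reads later. A minimal self-contained sketch of that producer/consumer pattern, with a whitespace split standing in for `bert_tokenizer.encode`:

```python
# Minimal background-tokenization sketch: the worker pushes its result into a
# Queue while the main thread keeps working, then the main thread blocks on get().
import queue
import threading

def fake_encode(text):
    return text.split()  # stand-in for bert_tokenizer.encode

def background_tokenizer(bodies, out_queue):
    out_queue.put([fake_encode(body) for body in bodies])

bodies = ["first document body", "second document body"]
my_queue = queue.Queue()
worker = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
worker.start()

# ... the summarizer forward pass would run here, overlapping with tokenization ...

bodies_tokenized = my_queue.get()  # blocks until the worker has put its result
worker.join()
print(bodies_tokenized)
```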
@@ -116,7 +117,7 @@ def background_tokenizer(bodies, out_queue):
 dataloader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, sampler=RandomSampler(dataset), drop_last=True, collate_fn=collate_func)
 
 for epi in range(n_epochs):
-    print("=================== EPOCH",epi, "===================")
+    print("=================== EPOCH", epi, "===================")
     for ib, documents in enumerate(dataloader):
         Timer = {}
 
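The `DataLoader` above shuffles with a `RandomSampler`, drops the last incomplete batch so every step sees a full batch, and routes raw rows through `collate_func`, which (per the earlier hunk) decodes byte rows into strings. A toy end-to-end version of that setup, with a made-up in-memory dataset in place of the SQL/HDF5 datasets and a simplified collate:

```python
# Toy DataLoader setup mirroring the line above: RandomSampler for shuffling,
# drop_last=True for fixed-size batches, and a collate_fn that decodes bytes.
from torch.utils.data import DataLoader, Dataset, RandomSampler

class ByteDocs(Dataset):
    def __init__(self, docs):
        self.rows = [(doc.encode(),) for doc in docs]  # rows of (bytes,) tuples
    def __len__(self):
        return len(self.rows)
    def __getitem__(self, idx):
        return self.rows[idx]

def collate_func(inps):
    return [inp[0].decode() for inp in inps]

dataset = ByteDocs(["document %d text" % i for i in range(11)])
dataloader = DataLoader(dataset=dataset, batch_size=5, sampler=RandomSampler(dataset),
                        drop_last=True, collate_fn=collate_func)
for batch in dataloader:
    print(batch)  # two batches of 5 strings; the 11th row is dropped
```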
@@ -126,7 +127,7 @@ def background_tokenizer(bodies, out_queue):
         bodies = [" ".join(doc.split(" ")[:300]) for doc in documents]
 
         # We run tokenization in the background, as the BERT tokenization is only used after the summarizer has run. Saves about 5% of time.
-        thread1 = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
+        thread1 = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
         # bodies_bert_tokenized = [bert_tokenizer.encode(body) for body in bodies] # This is the non-background version
         thread1.start()
 
@@ -159,11 +160,11 @@ def background_tokenizer(bodies, out_queue):
             sampled_scores = torch.FloatTensor(sampled_scores).to(args.device)
 
             argmax_scores, _ = scorer['model'].score(argmax_summaries, bodies, bodies_tokenized=bodies_bert_tokenized, extra=extra, lengths=argmax_end_idxs)
-            argmax_scores = torch.FloatTensor(argmax_scores).to(args.device)
+            argmax_scores = torch.FloatTensor(argmax_scores).to(args.device)
 
             Timer["scores_"+scorer['name']] = time.time()-T
             total_sampled_scores += (scorer['sign'])*(scorer['importance'])*sampled_scores
-            total_argmax_scores += (scorer['sign'])*(scorer['importance'])*argmax_scores
+            total_argmax_scores += (scorer['sign'])*(scorer['importance'])*argmax_scores
             log_obj[scorer['name']+"_score"] = sampled_scores.mean().item()
             scores_track[scorer['name']+"_scores"] = sampled_scores
 
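Inside the scorer loop, each model contributes `sign * importance * score` to two running totals, one for the sampled summary and one for the argmax (greedy) summary; the loss computed further down, outside this hunk, presumably contrasts the two in the style of self-critical training. A toy numeric version of the aggregation; the coverage and fluency weights echo the earlier hunk, but the score values and the penalty entry are invented:

```python
# Toy reward aggregation: total += sign * importance * score, kept separately
# for sampled and argmax summaries. All numbers below are made up.
import torch

scorers = [
    {"name": "coverage", "importance": 10.0, "sign": 1.0},
    {"name": "fluency",  "importance": 2.0,  "sign": 1.0},
    {"name": "penalty",  "importance": 5.0,  "sign": -1.0},  # hypothetical guardrail term
]
fake_scores = {  # (sampled, argmax) per scorer, for a batch of two summaries
    "coverage": (torch.tensor([0.40, 0.55]), torch.tensor([0.35, 0.50])),
    "fluency":  (torch.tensor([0.70, 0.60]), torch.tensor([0.65, 0.62])),
    "penalty":  (torch.tensor([0.10, 0.00]), torch.tensor([0.05, 0.00])),
}

total_sampled_scores = torch.zeros(2)
total_argmax_scores = torch.zeros(2)
for scorer in scorers:
    sampled, argmax = fake_scores[scorer["name"]]
    total_sampled_scores += scorer["sign"] * scorer["importance"] * sampled
    total_argmax_scores += scorer["sign"] * scorer["importance"] * argmax

print(total_sampled_scores - total_argmax_scores)  # > 0 where sampling beat the greedy baseline
```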
@@ -180,7 +181,7 @@ def background_tokenizer(bodies, out_queue):
         T6 = time.time()
         Timer['backward'] = T6-T5
 
-        if ib % args.optim_every == 0:
+        if ib % args.optim_every == 0:
             optimizer.step()
             optimizer.zero_grad()
 
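`optim_every` is plain gradient accumulation: every batch calls `backward()`, but the optimizer only steps and clears gradients every `optim_every` batches, so the effective batch size is `train_batch_size * optim_every`, matching the "multiplier to the true batch size" in the argument's help text. A minimal self-contained sketch with a placeholder model and loss:

```python
# Gradient accumulation sketch: gradients pile up across optim_every batches
# before a single optimizer step. Model, data, and loss are placeholders.
import torch

optim_every = 4
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for ib in range(16):                # stand-in for enumerate(dataloader)
    x = torch.randn(5, 8)           # fake batch of size 5
    loss = model(x).pow(2).mean()   # fake loss
    loss.backward()                 # gradients accumulate in .grad
    if ib % optim_every == 0:       # mirrors the hunk above, so it also steps at ib == 0
        optimizer.step()
        optimizer.zero_grad()
```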
@@ -220,7 +221,7 @@ def background_tokenizer(bodies, out_queue):
 
         if ckpt_every > 0 and len(total_score_history) > ckpt_lookback:
             current_score = np.mean(total_score_history[-ckpt_lookback:])
-
+
             if time.time()-time_ckpt > ckpt_every:
                 revert_ckpt = best_ckpt_score is not None and current_score < min(1.2*best_ckpt_score, 0.8*best_ckpt_score) # Could be negative or positive
                 print("================================== CKPT TIME, "+str(datetime.now())+" =================================")
@@ -232,7 +233,7 @@ def background_tokenizer(bodies, out_queue):
                     optimizer.load_state_dict(torch.load(ckpt_optimizer_file))
                 time_ckpt = time.time()
                 print("==============================================================================")
-
+
             if best_ckpt_score is None or current_score > best_ckpt_score:
                 print("[CKPT] Saved new best at: %.3f %s" % (current_score, "["+str(datetime.now())+"]"))
                 best_ckpt_score = current_score
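The checkpoint block keeps a running mean of recent total scores, saves the model whenever that mean sets a new best, and reloads the last best weights when the mean degrades past `min(1.2*best, 0.8*best)`, a bound that works whether scores are positive or negative. A compact sketch of that keep-best / revert logic, dropping the wall-clock `ckpt_every` gating; the file name, lookback window, and fake score curve are illustrative:

```python
# Keep-best / revert-on-regression checkpointing sketch. The time-based gating
# from the script is omitted; scores here are a random walk, not real training.
import numpy as np
import torch

model = torch.nn.Linear(4, 1)
ckpt_file = "best_ckpt_sketch.bin"   # hypothetical path
ckpt_lookback = 100
best_ckpt_score = None
total_score_history = []

for score in np.random.randn(300).cumsum():
    total_score_history.append(score)
    if len(total_score_history) <= ckpt_lookback:
        continue
    current_score = np.mean(total_score_history[-ckpt_lookback:])

    # min(1.2*b, 0.8*b) tolerates a 20% drop whether the best score is positive or negative.
    if best_ckpt_score is not None and current_score < min(1.2 * best_ckpt_score, 0.8 * best_ckpt_score):
        model.load_state_dict(torch.load(ckpt_file))  # roll back to the last best weights

    if best_ckpt_score is None or current_score > best_ckpt_score:
        best_ckpt_score = current_score
        torch.save(model.state_dict(), ckpt_file)     # save a new best
```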