 from paddle.io import DataLoader, Dataset

 from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion
 from paddlenlp.transformers import BertTokenizer, ErnieTokenizer
@@ -377,13 +378,13 @@ def do_train(args):
             dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                          args.max_predictions_per_seq,
                                          shared_file_list, args, worker_init)
-            train_reader_cost = 0.0
-            train_run_cost = 0.0
+            train_cost_avg = TimeCostAverage()
+            reader_cost_avg = TimeCostAverage()
             total_samples = 0
-            reader_start = time.time()
+            batch_start = time.time()
             for step, batch in enumerate(train_data_loader):
-                train_reader_cost += time.time() - reader_start
-                train_start = time.time()
+                train_reader_cost = time.time() - batch_start
+                reader_cost_avg.record(train_reader_cost)
                 global_step += 1
                 (input_ids, segment_ids, input_mask, masked_lm_positions,
                  masked_lm_labels, next_sentence_labels,
@@ -407,22 +408,23 @@ def do_train(args):
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.clear_grad()
-                train_run_cost += time.time() - train_start
                 total_samples += args.batch_size
+                train_run_cost = time.time() - batch_start
+                train_cost_avg.record(train_run_cost)
                 if global_step % args.logging_steps == 0:
                     if paddle.distributed.get_rank() == 0:
                         logger.info(
                             "global step: %d, epoch: %d, batch: %d, loss: %f, "
                             "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                             % (global_step, epoch, step, loss,
-                               train_reader_cost / args.logging_steps,
-                               (train_reader_cost + train_run_cost) /
-                               args.logging_steps, total_samples /
-                               args.logging_steps, total_samples /
-                               (train_reader_cost + train_run_cost)))
-                    train_reader_cost = 0.0
-                    train_run_cost = 0.0
+                               reader_cost_avg.get_average(),
+                               train_cost_avg.get_average(), total_samples /
+                               args.logging_steps, total_samples / (
+                                   args.logging_steps *
+                                   train_cost_avg.get_average())))
                     total_samples = 0
+                    train_cost_avg.reset()
+                    reader_cost_avg.reset()
                 if global_step % args.save_steps == 0:
                     if paddle.distributed.get_rank() == 0:
                         output_dir = os.path.join(args.output_dir,
@@ -440,7 +442,7 @@ def do_train(args):
                 if global_step >= args.max_steps:
                     del train_data_loader
                     return
-                reader_start = time.time()
+                batch_start = time.time()

             del train_data_loader
             train_data_loader, data_file = dataset_future.result(timeout=None)
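For context, this patch replaces the hand-rolled `train_reader_cost` / `train_run_cost` accumulators with the `TimeCostAverage` helper imported from `paddlenlp.utils.tools`. Below is a minimal sketch of the interface the patch relies on (`record`, `get_average`, `reset`), assuming the helper simply averages the costs recorded since the last reset; it is illustrative only, not the actual PaddleNLP implementation.

# Minimal sketch of a TimeCostAverage-style helper (assumed behavior,
# not the real paddlenlp.utils.tools.TimeCostAverage source).
class TimeCostAverage:
    def __init__(self):
        self.reset()

    def reset(self):
        # Clear the accumulated time and the number of recorded steps.
        self.cnt = 0
        self.total_time = 0.0

    def record(self, usetime):
        # Accumulate the elapsed time of one step, in seconds.
        self.cnt += 1
        self.total_time += usetime

    def get_average(self):
        # Mean cost per recorded step; 0 if nothing has been recorded yet.
        if self.cnt == 0:
            return 0.0
        return self.total_time / self.cnt

With this helper, `reader_cost_avg` averages the per-step time spent waiting on the data loader and `train_cost_avg` averages the full per-batch time, so the logged ips becomes `total_samples / (args.logging_steps * train_cost_avg.get_average())`, i.e. the samples processed in the logging window divided by the total batch time of that window.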