@@ -450,7 +450,6 @@ def build_callbacks(self, conf, callbacks_list):
450
450
os .makedirs (csvlog_save_path )
451
451
452
452
callbacks_list = conf ['callbacks' ]['list' ]
453
-
454
453
callbacks = [cbks .BaseLogger ()]
455
454
callbacks += [self .history ]
456
455
callbacks += [cbks .CSVLogger ("{}callbacks-{}.log" .format (
@@ -581,9 +580,10 @@ def train_epoch(self):
581
580
effective_epochs = 1.0 * self .num_so_far / num_total
582
581
epoch_previous = self .epoch
583
582
self .epoch = effective_epochs
584
- g .write_unique ('\n Epoch {:.2f} finished ({:.2f} epochs passed)' .format (
585
- 1.0 * self .epoch , self .epoch - epoch_previous )
586
- + ' in {:.2f} seconds.\n ' .format (t2 - t_start ))
583
+ g .write_unique (
584
+ '\n Epoch {:.2f} finished training ({:.2f} epochs passed)' .format (
585
+ 1.0 * self .epoch , self .epoch - epoch_previous )
586
+ + ' in {:.2f} seconds.\n ' .format (t2 - t_start ))
587
587
return (step , ave_loss , curr_loss , self .num_so_far , effective_epochs )
588
588
589
589
def estimate_remaining_time (self , time_so_far , work_so_far , work_total ):
@@ -687,6 +687,10 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
687
687
688
688
model .reset_states ()
689
689
if g .task_index == 0 :
690
+ # TODO(KGF): this appears to prepend a \n, resulting in:
691
+ # [2] loading from epoch 7
692
+ #
693
+ # 128/862 [===>..........................] - ETA: 2:20
690
694
pbar = Progbar (len (shot_list ))
691
695
shot_sublists = shot_list .sublists (conf ['model' ]['pred_batch_size' ],
692
696
do_shuffle = False , equal_size = True )
@@ -870,12 +874,13 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
870
874
cmp_fn = min
871
875
872
876
while e < (num_epochs - 1 ):
873
- g .write_unique ("begin epoch {}" .format (e ))
877
+ g .write_unique ('\n Begin training from epoch {:.2f}/{}' .format (
878
+ e , num_epochs ))
874
879
if g .task_index == 0 :
875
880
callbacks .on_epoch_begin (int (round (e )))
876
881
mpi_model .set_lr (lr * lr_decay ** e )
877
- g .write_unique ('\n Epoch {}/{}' .format (e , num_epochs ))
878
882
883
+ # KGF: the core work of the loop is performed on the next line
879
884
(step , ave_loss , curr_loss , num_so_far ,
880
885
effective_epochs ) = mpi_model .train_epoch ()
881
886
e = e_old + effective_epochs
@@ -885,9 +890,16 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
885
890
specific_builder .save_model_weights (train_model , int (round (e )))
886
891
887
892
epoch_logs = {}
893
+ g .write_unique ('Begin evaluation of epoch {:.2f}/{}\n ' .format (
894
+ e , num_epochs ))
895
+ # TODO(KGF): flush output/ MPI barrier?
896
+ # g.flush_all_inorder()
888
897
898
+ # TODO(KGF): is there a way to avoid repeated calls to
899
+ # Keras.Models.load_weights() across the mpi_make_pred*() functions?
889
900
_ , _ , _ , roc_area , loss = mpi_make_predictions_and_evaluate (
890
901
conf , shot_list_validate , loader )
902
+
891
903
if conf ['training' ]['ranking_difficulty_fac' ] != 1.0 :
892
904
(_ , _ , _ , roc_area_train ,
893
905
loss_train ) = mpi_make_predictions_and_evaluate (
@@ -905,31 +917,35 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
905
917
times = conf ['callbacks' ]['monitor_times' ]
906
918
areas , _ = mpi_make_predictions_and_evaluate_multiple_times (
907
919
conf , shot_list_validate , loader , times )
908
- for roc , t in zip (areas , times ):
909
- g .write_unique ('epoch {}, val_roc_{} = {}' .format (
910
- int (round (e )), t , roc ))
920
+ epoch_str = 'epoch {}, ' .format (int (round (e )))
921
+ g .write_unique (epoch_str + ' ' .join (
922
+ ['val_roc_{} = {}' .format (t , roc ) for t , roc in zip (
923
+ times , areas )]
924
+ ) + '\n ' )
911
925
if shot_list_test is not None :
912
926
areas , _ = mpi_make_predictions_and_evaluate_multiple_times (
913
927
conf , shot_list_test , loader , times )
914
- for roc , t in zip (areas , times ):
915
- g .write_unique ('epoch {}, test_roc_{} = {}' .format (
916
- int (round (e )), t , roc ))
928
+ g .write_unique (epoch_str + ' ' .join (
929
+ ['test_roc_{} = {}' .format (t , roc ) for t , roc in zip (
930
+ times , areas )]
931
+ ) + '\n ' )
917
932
918
933
epoch_logs ['val_roc' ] = roc_area
919
934
epoch_logs ['val_loss' ] = loss
920
935
epoch_logs ['train_loss' ] = ave_loss
921
936
best_so_far = cmp_fn (epoch_logs [conf ['callbacks' ]['monitor' ]],
922
937
best_so_far )
923
938
stop_training = False
939
+ g .flush_all_inorder ()
924
940
if g .task_index == 0 :
925
- print ('=========Summary======== for epoch{ }' .format (step ))
941
+ print ('=========Summary======== for epoch {:.2f }' .format (e ))
926
942
print ('Training Loss numpy: {:.3e}' .format (ave_loss ))
927
943
print ('Validation Loss: {:.3e}' .format (loss ))
928
944
print ('Validation ROC: {:.4f}' .format (roc_area ))
929
945
if conf ['training' ]['ranking_difficulty_fac' ] != 1.0 :
930
946
print ('Training Loss: {:.3e}' .format (loss_train ))
931
947
print ('Training ROC: {:.4f}' .format (roc_area_train ))
932
-
948
+ print ( '======================== ' )
933
949
callbacks .on_epoch_end (int (round (e )), epoch_logs )
934
950
if hasattr (mpi_model .model , 'stop_training' ):
935
951
stop_training = mpi_model .model .stop_training
@@ -951,7 +967,10 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
951
967
tensorboard .on_epoch_end (val_generator , val_steps ,
952
968
int (round (e )), epoch_logs )
953
969
stop_training = g .comm .bcast (stop_training , root = 0 )
954
- g .write_unique ("end epoch {}" .format (e ))
970
+ g .write_unique ('Finished evaluation of epoch {:.2f}/{}' .format (
971
+ e , num_epochs ))
972
+ # TODO(KGF): compare to old diagnostic:
973
+ # g.write_unique("end epoch {}".format(e_old))
955
974
if stop_training :
956
975
g .write_unique ("Stopping training due to early stopping" )
957
976
break
0 commit comments