Skip to content

Commit 9000727

Browse files
committed
Fix and improve diagnostics within main mpi_train() loop over epochs
Preparing to re-index epochs with 1-based indexing
1 parent: 4b549ef; commit: 9000727

File tree

1 file changed

+34
-15
lines changed

1 file changed

+34
-15
lines changed

plasma/models/mpi_runner.py

Lines changed: 34 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -450,7 +450,6 @@ def build_callbacks(self, conf, callbacks_list):
450450
os.makedirs(csvlog_save_path)
451451

452452
callbacks_list = conf['callbacks']['list']
453-
454453
callbacks = [cbks.BaseLogger()]
455454
callbacks += [self.history]
456455
callbacks += [cbks.CSVLogger("{}callbacks-{}.log".format(
@@ -581,9 +580,10 @@ def train_epoch(self):
581580
effective_epochs = 1.0*self.num_so_far/num_total
582581
epoch_previous = self.epoch
583582
self.epoch = effective_epochs
584-
g.write_unique('\nEpoch {:.2f} finished ({:.2f} epochs passed)'.format(
585-
1.0 * self.epoch, self.epoch - epoch_previous)
586-
+ ' in {:.2f} seconds.\n'.format(t2 - t_start))
583+
g.write_unique(
584+
'\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
585+
1.0 * self.epoch, self.epoch - epoch_previous)
586+
+ ' in {:.2f} seconds.\n'.format(t2 - t_start))
587587
return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
588588

589589
def estimate_remaining_time(self, time_so_far, work_so_far, work_total):
@@ -687,6 +687,10 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
687687

688688
model.reset_states()
689689
if g.task_index == 0:
690+
# TODO(KGF): this appears to prepend a \n, resulting in:
691+
# [2] loading from epoch 7
692+
#
693+
# 128/862 [===>..........................] - ETA: 2:20
690694
pbar = Progbar(len(shot_list))
691695
shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
692696
do_shuffle=False, equal_size=True)
@@ -870,12 +874,13 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
870874
cmp_fn = min
871875

872876
while e < (num_epochs - 1):
873-
g.write_unique("begin epoch {}".format(e))
877+
g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
878+
e, num_epochs))
874879
if g.task_index == 0:
875880
callbacks.on_epoch_begin(int(round(e)))
876881
mpi_model.set_lr(lr*lr_decay**e)
877-
g.write_unique('\nEpoch {}/{}'.format(e, num_epochs))
878882

883+
# KGF: core work of loop performed in next line
879884
(step, ave_loss, curr_loss, num_so_far,
880885
effective_epochs) = mpi_model.train_epoch()
881886
e = e_old + effective_epochs
@@ -885,9 +890,16 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
885890
specific_builder.save_model_weights(train_model, int(round(e)))
886891

887892
epoch_logs = {}
893+
g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
894+
e, num_epochs))
895+
# TODO(KGF): flush output/ MPI barrier?
896+
# g.flush_all_inorder()
888897

898+
# TODO(KGF): is there a way to avoid Keras.Models.load_weights()
899+
# repeated calls throughout mpi_make_pred*() fn calls?
889900
_, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
890901
conf, shot_list_validate, loader)
902+
891903
if conf['training']['ranking_difficulty_fac'] != 1.0:
892904
(_, _, _, roc_area_train,
893905
loss_train) = mpi_make_predictions_and_evaluate(
@@ -905,31 +917,35 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
905917
times = conf['callbacks']['monitor_times']
906918
areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
907919
conf, shot_list_validate, loader, times)
908-
for roc, t in zip(areas, times):
909-
g.write_unique('epoch {}, val_roc_{} = {}'.format(
910-
int(round(e)), t, roc))
920+
epoch_str = 'epoch {}, '.format(int(round(e)))
921+
g.write_unique(epoch_str + ' '.join(
922+
['val_roc_{} = {}'.format(t, roc) for t, roc in zip(
923+
times, areas)]
924+
) + '\n')
911925
if shot_list_test is not None:
912926
areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
913927
conf, shot_list_test, loader, times)
914-
for roc, t in zip(areas, times):
915-
g.write_unique('epoch {}, test_roc_{} = {}'.format(
916-
int(round(e)), t, roc))
928+
g.write_unique(epoch_str + ' '.join(
929+
['test_roc_{} = {}'.format(t, roc) for t, roc in zip(
930+
times, areas)]
931+
) + '\n')
917932

918933
epoch_logs['val_roc'] = roc_area
919934
epoch_logs['val_loss'] = loss
920935
epoch_logs['train_loss'] = ave_loss
921936
best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
922937
best_so_far)
923938
stop_training = False
939+
g.flush_all_inorder()
924940
if g.task_index == 0:
925-
print('=========Summary======== for epoch{}'.format(step))
941+
print('=========Summary======== for epoch {:.2f}'.format(e))
926942
print('Training Loss numpy: {:.3e}'.format(ave_loss))
927943
print('Validation Loss: {:.3e}'.format(loss))
928944
print('Validation ROC: {:.4f}'.format(roc_area))
929945
if conf['training']['ranking_difficulty_fac'] != 1.0:
930946
print('Training Loss: {:.3e}'.format(loss_train))
931947
print('Training ROC: {:.4f}'.format(roc_area_train))
932-
948+
print('======================== ')
933949
callbacks.on_epoch_end(int(round(e)), epoch_logs)
934950
if hasattr(mpi_model.model, 'stop_training'):
935951
stop_training = mpi_model.model.stop_training
@@ -951,7 +967,10 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
951967
tensorboard.on_epoch_end(val_generator, val_steps,
952968
int(round(e)), epoch_logs)
953969
stop_training = g.comm.bcast(stop_training, root=0)
954-
g.write_unique("end epoch {}".format(e))
970+
g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
971+
e, num_epochs))
972+
# TODO(KGF): compare to old diagnostic:
973+
# g.write_unique("end epoch {}".format(e_old))
955974
if stop_training:
956975
g.write_unique("Stopping training due to early stopping")
957976
break

0 commit comments

Comments
 (0)