Skip to content

Commit b036c0b

Browse files
committed
Make it explicit that diagnostic within train_epoch() refers to local progress
1 parent 9000727 commit b036c0b

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

examples/mpi_learn.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,6 @@
6767
custom_path = sys.argv[1]
6868
g.print_unique("predicting using path {}".format(custom_path))
6969

70-
7170
#####################################################
7271
# NORMALIZATION #
7372
#####################################################
@@ -101,10 +100,16 @@
101100
# TRAINING #
102101
#####################################################
103102

104-
# ensure training has a separate random seed for every worker
103+
# Prevent Keras TF backend deprecation messages from mpi_train() from
104+
# appearing jumbled with stdout, stderr msgs from above steps
105+
g.comm.Barrier()
106+
g.flush_all_inorder()
107+
108+
# reminder: ensure training has a separate random seed for every worker
105109
if not only_predict:
106110
mpi_train(conf, shot_list_train, shot_list_validate, loader,
107111
shot_list_test=shot_list_test)
112+
g.flush_all_inorder()
108113

109114
#####################################################
110115
# TESTING #

plasma/models/mpi_runner.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -581,9 +581,14 @@ def train_epoch(self):
581581
epoch_previous = self.epoch
582582
self.epoch = effective_epochs
583583
g.write_unique(
584-
'\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
585-
1.0 * self.epoch, self.epoch - epoch_previous)
586-
+ ' in {:.2f} seconds.\n'.format(t2 - t_start))
584+
# TODO(KGF): "a total of X epochs within this session" ?
585+
'\nFinished training epoch {:.2f} '.format(self.epoch)
586+
# TODO(KGF): "precisely/exactly X epochs just passed"?
587+
+ 'during this session ({:.2f} epochs passed)'.format(
588+
self.epoch - epoch_previous)
589+
# '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
590+
# 1.0 * self.epoch, self.epoch - epoch_previous)
591+
+ ' in {:.2f} seconds\n'.format(t2 - t_start))
587592
return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
588593

589594
def estimate_remaining_time(self, time_so_far, work_so_far, work_total):
@@ -884,7 +889,10 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
884889
(step, ave_loss, curr_loss, num_so_far,
885890
effective_epochs) = mpi_model.train_epoch()
886891
e = e_old + effective_epochs
892+
g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
893+
e, num_epochs))
887894

895+
# TODO(KGF): add diagnostic about "saving to epoch X"?
888896
loader.verbose = False # True during the first iteration
889897
if g.task_index == 0:
890898
specific_builder.save_model_weights(train_model, int(round(e)))

0 commit comments

Comments
 (0)