Make it explicit that the diagnostic within train_epoch() refers to local progress

felker · felker · commit b036c0bb2c21 · 2019-11-07T14:31:42.000-05:00
diff --git a/examples/mpi_learn.py b/examples/mpi_learn.py
@@ -67,7 +67,6 @@
     custom_path = sys.argv[1]
     g.print_unique("predicting using path {}".format(custom_path))
 
-
 #####################################################
 #                 NORMALIZATION                     #
 #####################################################
@@ -101,10 +100,16 @@
 #                    TRAINING                       #
 #####################################################
 
-# ensure training has a separate random seed for every worker
+# Prevent Keras TF backend deprecation messages emitted by mpi_train() from
+# being interleaved with the stdout/stderr messages of the steps above
+g.comm.Barrier()
+g.flush_all_inorder()
+
+# reminder: ensure training has a separate random seed for every worker
 if not only_predict:
     mpi_train(conf, shot_list_train, shot_list_validate, loader,
               shot_list_test=shot_list_test)
+g.flush_all_inorder()
 
 #####################################################
 #                    TESTING                        #
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
@@ -581,9 +581,14 @@ def train_epoch(self):
         epoch_previous = self.epoch
         self.epoch = effective_epochs
         g.write_unique(
-            '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
-                1.0 * self.epoch, self.epoch - epoch_previous)
-            + ' in {:.2f} seconds.\n'.format(t2 - t_start))
+            # TODO(KGF): "a total of X epochs within this session" ?
+            '\nFinished training epoch {:.2f} '.format(self.epoch)
+            # TODO(KGF): "precisely/exactly X epochs just passed"?
+            + 'during this session ({:.2f} epochs passed)'.format(
+                self.epoch - epoch_previous)
+            # '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
+            #     1.0 * self.epoch, self.epoch - epoch_previous)
+            + ' in {:.2f} seconds\n'.format(t2 - t_start))
         return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
 
     def estimate_remaining_time(self, time_so_far, work_so_far, work_total):
@@ -884,7 +889,10 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
         (step, ave_loss, curr_loss, num_so_far,
          effective_epochs) = mpi_model.train_epoch()
         e = e_old + effective_epochs
+        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
+            e, num_epochs))
 
+        # TODO(KGF): add diagnostic about "saving to epoch X"?
         loader.verbose = False  # True during the first iteration
         if g.task_index == 0:
             specific_builder.save_model_weights(train_model, int(round(e)))