@@ -410,9 +410,6 @@ def train(self):
                 self.interrupted = True
                 self.on_keyboard_interrupt()
 
-        for proc in self.interactive_ddp_procs:
-            subprocess.Popen.kill(proc)
-
         self.run_training_teardown()
 
     def prepare_train_loop_dataloader(self, train_dataloader):
@@ -853,9 +850,7 @@ def run_training_teardown(self):
         if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
            return
 
-        # clean up dist group
-        if self.use_ddp or self.use_ddp2:
-            torch_distrib.destroy_process_group()
+        self._teardown_already_run = True
 
         # Train end events
         with self.profiler.profile('on_train_end'):
@@ -869,8 +864,16 @@ def run_training_teardown(self):
             self.logger.finalize("success")
 
         # summarize profile results
-        self.profiler.describe()
-        self._teardown_already_run = True
+        if self.global_rank == 0:
+            self.profiler.describe()
+
+        if self.global_rank == 0:
+            for proc in self.interactive_ddp_procs:
+                subprocess.Popen.kill(proc)
+
+        # clean up dist group
+        if self.use_ddp or self.use_ddp2:
+            torch_distrib.destroy_process_group()
 
     def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         """