File tree Expand file tree Collapse file tree 1 file changed +8
-0
lines changed Expand file tree Collapse file tree 1 file changed +8
-0
lines changed Original file line number Diff line number Diff line change @@ -587,6 +587,14 @@ def _helper_test_extra_cuda_context_by_memory(self):
587587 """
588588 device = torch .device ("cuda:%d" % self .rank )
589589 x = torch .empty ((1 ,), device = device )
590+
591+ # We need this barrier to ensure that all nodes have completed init_process_group.
592+ # If rank 0 takes a memory snapshot before the other nodes have finished init_process_group,
593+ # we see an artificial bump in memory usage. As per the following comment,
594+ # we are going to be moving away from this function:
595+ # https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
596+ c10d .barrier ()
597+
590598 # Rank 0 takes a snapshot before collective -- this snapshot should have
591599 # included rank 0's own context.
592600 if self .rank == 0 :
You can’t perform that action at this time.
0 commit comments