@@ -124,14 +124,19 @@ def cleanup():
124124 dist .destroy_process_group ()
125125
126126
127- def _try_init_process_group (local_rank : int , world_size : int , port : int ) -> bool :
128- """Attempt to initialize process group. Returns True on success, False on EADDRINUSE ."""
127+ def _set_distributed_env_vars (local_rank : int , world_size : int , port : int ) -> None :
128+ """Export RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT and LOCAL_RANK for the env:// init method of dist.init_process_group (an init method of the process-group API, not of NCCL); RANK mirrors local_rank and MASTER_ADDR is fixed to 127.0.0.1."""
129129 os .environ ["RANK" ] = str (local_rank )
130130 os .environ ["WORLD_SIZE" ] = str (world_size )
131131 os .environ ["MASTER_ADDR" ] = "127.0.0.1"
132132 os .environ ["MASTER_PORT" ] = str (port )
133133 os .environ ["LOCAL_RANK" ] = str (local_rank )
134134
135+
136+ def _try_init_process_group (local_rank : int , world_size : int , port : int ) -> bool :
137+ """Attempt to initialize process group. Returns True on success, False on EADDRINUSE."""
138+ _set_distributed_env_vars (local_rank , world_size , port )
139+
135140 try :
136141 dist .init_process_group (
137142 "nccl" ,
@@ -216,6 +221,7 @@ def initialize(
216221 if port == - 1 :
217222 raise RuntimeError ("Rank 0 failed to initialize, cannot proceed" )
218223 ad_logger .info (f"Initializing for: { lib = } , { local_rank = } , { world_size = } , { port = } " )
224+ _set_distributed_env_vars (local_rank , world_size , port )
219225 dist .init_process_group (
220226 "nccl" ,
221227 world_size = world_size ,
@@ -225,6 +231,7 @@ def initialize(
225231 else :
226232 # Original path: no retry mechanism (OMPI, torchelastic, or single process)
227233 ad_logger .info (f"Initializing for: { lib = } , { local_rank = } , { world_size = } , { port = } " )
234+ _set_distributed_env_vars (local_rank , world_size , port )
228235 dist .init_process_group (
229236 "nccl" ,
230237 world_size = world_size ,
0 commit comments