Skip to content

Commit 745b2fa

Browse files
committed
Fixed missing env vars for ranks >0
Signed-off-by: Eran Geva <[email protected]>
1 parent 5b13957 commit 745b2fa

File tree

1 file changed

+9
-2
lines changed
  • tensorrt_llm/_torch/auto_deploy/distributed

1 file changed

+9
-2
lines changed

tensorrt_llm/_torch/auto_deploy/distributed/common.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -124,14 +124,19 @@ def cleanup():
124124
dist.destroy_process_group()
125125

126126

127-
def _try_init_process_group(local_rank: int, world_size: int, port: int) -> bool:
128-
"""Attempt to initialize process group. Returns True on success, False on EADDRINUSE."""
127+
def _set_distributed_env_vars(local_rank: int, world_size: int, port: int) -> None:
    """Export the rendezvous variables read by torch.distributed's ``env://`` init.

    ``env://`` is a ``torch.distributed`` initialization method (the NCCL
    backend is selected separately by the callers); it expects the variables
    written here to be present in the process environment.

    Args:
        local_rank: Rank of this process. Written to both ``RANK`` and
            ``LOCAL_RANK``.
        world_size: Total number of participating processes.
        port: TCP port of the rendezvous master, written to ``MASTER_PORT``.

    Side effects:
        Overwrites ``RANK``, ``WORLD_SIZE``, ``MASTER_ADDR``, ``MASTER_PORT``
        and ``LOCAL_RANK`` in ``os.environ``. ``MASTER_ADDR`` is always set to
        the loopback address ``127.0.0.1``.
    """
    # NOTE(review): this unconditionally replaces any values already exported
    # by an external launcher; the non-retry call site is commented as serving
    # OMPI/torchelastic -- confirm that clobbering MASTER_ADDR/MASTER_PORT is
    # intended there.
    rank = str(local_rank)
    os.environ.update(
        {
            "RANK": rank,
            "WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "127.0.0.1",
            "MASTER_PORT": str(port),
            "LOCAL_RANK": rank,
        }
    )
134134

135+
136+
def _try_init_process_group(local_rank: int, world_size: int, port: int) -> bool:
137+
"""Attempt to initialize process group. Returns True on success, False on EADDRINUSE."""
138+
_set_distributed_env_vars(local_rank, world_size, port)
139+
135140
try:
136141
dist.init_process_group(
137142
"nccl",
@@ -216,6 +221,7 @@ def initialize(
216221
if port == -1:
217222
raise RuntimeError("Rank 0 failed to initialize, cannot proceed")
218223
ad_logger.info(f"Initializing for: {lib=}, {local_rank=}, {world_size=}, {port=}")
224+
_set_distributed_env_vars(local_rank, world_size, port)
219225
dist.init_process_group(
220226
"nccl",
221227
world_size=world_size,
@@ -225,6 +231,7 @@ def initialize(
225231
else:
226232
# Original path: no retry mechanism (OMPI, torchelastic, or single process)
227233
ad_logger.info(f"Initializing for: {lib=}, {local_rank=}, {world_size=}, {port=}")
234+
_set_distributed_env_vars(local_rank, world_size, port)
228235
dist.init_process_group(
229236
"nccl",
230237
world_size=world_size,

0 commit comments

Comments
 (0)