Skip to content

Commit 9d45b3a

Browse files
committed
address CR comments
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent d55cc53 commit 9d45b3a

File tree

2 files changed

+16
-16
lines changed

2 files changed

+16
-16
lines changed

preflight-checks/nccl-allreduce/nccl_allreduce/__main__.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,9 @@ def run() -> int:
9494
log.error("Configuration error", extra={"error": str(err)})
9595
return NCCLError.GANG_CONFIG_ERROR.value.exit_code
9696

97-
# Set NCCL timeout environment variables if not already set
98-
# This helps fail faster if there are network connectivity issues
99-
if "NCCL_TIMEOUT" not in os.environ:
100-
os.environ["NCCL_TIMEOUT"] = "1800" # 30 minutes default
101-
if "NCCL_ASYNC_ERROR_HANDLING" not in os.environ:
102-
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
97+
# Set NCCL defaults if not already set by the container env.
10398
if "NCCL_DEBUG" not in os.environ:
104-
os.environ["NCCL_DEBUG"] = "INFO" # Enable NCCL debug logging
99+
os.environ["NCCL_DEBUG"] = "INFO"
105100

106101
try:
107102
log.info("Initializing NCCL process group", extra={"backend": "nccl"})

preflight-checks/nccl-allreduce/scripts/entrypoint.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def main() -> int:
9090

9191
# 2. Wait for gang formation and validate
9292
gang_config = _wait_for_gang(cfg)
93-
if gang_config is None:
94-
return NCCLError.GANG_CONFIG_ERROR.value.exit_code
93+
if isinstance(gang_config, int):
94+
return gang_config
9595

9696
# 3. Launch torchrun (replaces this process)
9797
_launch_torchrun(gang_config, cfg.nprocs_per_node)
@@ -143,11 +143,11 @@ def _load_config() -> _EntrypointConfig | None:
143143
)
144144

145145

146-
def _wait_for_gang(cfg: _EntrypointConfig) -> GangConfig | None:
146+
def _wait_for_gang(cfg: _EntrypointConfig) -> GangConfig | int:
147147
"""Wait for gang formation and validate the resulting configuration.
148148
149149
Returns:
150-
The gang configuration on success, or None on failure (error already reported).
150+
The gang configuration on success, or an NCCLError exit code on failure.
151151
"""
152152
waiter = GangWaiter(cfg.gang_config_dir)
153153

@@ -156,23 +156,23 @@ def _wait_for_gang(cfg: _EntrypointConfig) -> GangConfig | None:
156156
except TimeoutError as err:
157157
log.error("Gang formation timeout", extra={"error": str(err)})
158158
_report_error(NCCLError.GANG_TIMEOUT, str(err))
159-
return None
159+
return NCCLError.GANG_TIMEOUT.value.exit_code
160160
except ValueError as err:
161161
log.error("Invalid gang configuration", extra={"error": str(err)})
162162
_report_error(NCCLError.GANG_CONFIG_ERROR, str(err))
163-
return None
163+
return NCCLError.GANG_CONFIG_ERROR.value.exit_code
164164

165165
if gang_config.my_rank < 0:
166166
error_msg = f"Pod {cfg.pod_name} not found in peers list"
167167
log.error(error_msg)
168168
_report_error(NCCLError.GANG_CONFIG_ERROR, error_msg)
169-
return None
169+
return NCCLError.GANG_CONFIG_ERROR.value.exit_code
170170

171171
if not gang_config.master_addr:
172172
error_msg = "Master address not set in ConfigMap"
173173
log.error(error_msg)
174174
_report_error(NCCLError.GANG_CONFIG_ERROR, error_msg)
175-
return None
175+
return NCCLError.GANG_CONFIG_ERROR.value.exit_code
176176

177177
return gang_config
178178

@@ -202,7 +202,12 @@ def _launch_torchrun(gang_config: GangConfig, nprocs_per_node: int) -> None:
202202

203203
os.environ["NPROCS_PER_NODE"] = str(nprocs_per_node)
204204

205-
os.execvp(cmd[0], cmd)
205+
try:
206+
os.execvp(cmd[0], cmd)
207+
except OSError as err:
208+
log.error("Failed to exec torchrun", extra={"command": cmd[0], "error": str(err)})
209+
_report_error(NCCLError.GANG_CONFIG_ERROR, f"Failed to exec {cmd[0]}: {err}")
210+
sys.exit(NCCLError.GANG_CONFIG_ERROR.value.exit_code)
206211

207212

208213
def _detect_gpu_count() -> int:

0 commit comments

Comments
 (0)