Skip to content

Commit e8ce6f6

Browse files
authored
[bug] Fix oumi distributed on Slurm to use correct node rank env var (#2020)
1 parent 85936ae commit e8ce6f6

File tree

2 files changed

+4 / -6 lines changed (4 additions, 6 deletions)

2 files changed

+4 / -6 lines changed (4 additions, 6 deletions)

src/oumi/cli/distributed_run.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -422,17 +422,15 @@ def _detect_slurm_process_run_info(env: dict[str, str]) -> Optional[_ProcessRunI
422422
if len(node_ips) == 0:
423423
raise RuntimeError("Empty list of nodes in 'SLURM_NODELIST'!")
424424
gpus_per_node = torch.cuda.device_count()
425-
node_rank = _get_optional_int_env_var("SLURM_PROCID", env)
426-
if node_rank is None:
427-
node_rank = _get_optional_int_env_var("PMI_RANK", env)
428425

426+
node_rank = _get_optional_int_env_var("SLURM_NODEID", env)
429427
# If running on a single node, default to 0.
430428
if node_rank is None and len(node_ips) == 1:
431429
node_rank = 0
432430
if node_rank is None:
433431
raise ValueError(
434432
"Unable to determine node rank on a multi-node setup. "
435-
"Neither 'SLURM_PROCID' nor 'PMI_RANK' is set "
433+
"'SLURM_NODEID' is not set."
436434
)
437435

438436
return _ProcessRunInfo(

tests/unit/cli/test_cli_distributed_run.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,7 @@ def test_torchrun_polaris_multi_gpu(
229229
assert logger.level == logging.DEBUG
230230

231231

232-
def test_torchrun_frontier_multi_gpu(
232+
def test_torchrun_slurm_multi_gpu(
233233
app,
234234
mock_os,
235235
mock_popen,
@@ -239,7 +239,7 @@ def test_torchrun_frontier_multi_gpu(
239239
):
240240
test_env_vars = {
241241
"SLURM_NODELIST": "frontier[04316-04317]",
242-
"PMI_RANK": 1,
242+
"SLURM_NODEID": 1,
243243
"SLURM_JOBID": "123456.frontier",
244244
# Define the redundant OUMI_ variables to activate consistency checks.
245245
"OUMI_TOTAL_NUM_GPUS": 16,

0 commit comments

Comments
 (0)