Skip to content

Commit 3b2b91a

Browse files
authored
Fixes for multi-node execution with torchrun + LocalExecutor (#251)
- Do the prepare stage only from a single process or rank.
- For --node-rank, also look for SLURM_NODEID.

Signed-off-by: Pramod Kumbhar <[email protected]>
1 parent a61734b commit 3b2b91a

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

nemo_run/run/experiment.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -665,7 +665,10 @@ def run(
665665
return
666666

667667
# Prepare experiment before running
668-
self._prepare()
668+
669+
# in case of multi-node execution with LocalExecutor+torchrun+slurm, run only on first rank
670+
if int(os.getenv("SLURM_PROCID", 0)) == 0:
671+
self._prepare()
669672

670673
if direct:
671674
self.console.log(

nemo_run/run/torchx_backend/components/torchrun.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,11 @@ def torchrun(
128128
num_nodes = nnodes_rep
129129
nproc_per_node = str(nproc_per_node)
130130

131+
# set node rank to relative node id in the current allocation
131132
if use_env and os.getenv("NODE_RANK"):
132133
node_rank = os.environ["NODE_RANK"]
134+
elif use_env and os.getenv("SLURM_NODEID"):
135+
node_rank = os.environ["SLURM_NODEID"]
133136
else:
134137
node_rank = torchx_dist._noquote(f"$${ExecutorMacros.NODE_RANK_VAR}")
135138

0 commit comments

Comments (0)