Skip to content

Commit f3221a5

Browse files
Update multi_node_cluster_auto_slurm.py
1 parent fe17d14 commit f3221a5

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,19 @@ def optimize_on_cluster(hyperparams):
115115
# set DDP master port
116116
cluster.add_command(f'export MASTER_PORT={PORT}')
117117

118-
# YOU MIGHT NEED THESE
119-
# cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
120-
# cluster.add_command('export NCCL_DEBUG=INFO')
121-
# cluster.add_command('export PYTHONFAULTHANDLER=1')
122-
# cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
118+
# OPTIONAL for debugging
119+
# without these flags, errors in your code will
120+
# appear to be NCCL errors
121+
cluster.add_command('export NCCL_DEBUG=INFO')
122+
cluster.add_command('export PYTHONFAULTHANDLER=1')
123+
124+
# depending on your cluster config, you probably want
125+
# to restrict NCCL to the wired network interface (skip docker0 and lo)
126+
# cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
127+
128+
# depending on your cluster, you might need to load
129+
# the latest NCCL version
130+
# cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
123131

124132
# run only on 32GB voltas
125133
cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',

0 commit comments

Comments
 (0)