Skip to content

Commit fe17d14

Browse files
Update multi_node_cluster_auto_slurm.py
1 parent 9576dd2 commit fe17d14

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from examples.new_project_templates.lightning_module_template import LightningTemplateModel
1414

15+
PORT = np.random.randint(12000, 20000, 1)[0]
1516
SEED = 2334
1617
torch.manual_seed(SEED)
1718
np.random.seed(SEED)
@@ -111,6 +112,15 @@ def optimize_on_cluster(hyperparams):
111112
# activate the conda environment the job's code should run in
112113
cluster.add_command(f'source activate {hyperparams.conda_env}')
113114

115+
# set DDP master port
116+
cluster.add_command(f'export MASTER_PORT={PORT}')
117+
118+
# Optional: you might need these on your cluster; uncomment any that apply
119+
# cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
120+
# cluster.add_command('export NCCL_DEBUG=INFO')
121+
# cluster.add_command('export PYTHONFAULTHANDLER=1')
122+
# cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
123+
114124
# run only on 32GB Volta GPUs
115125
cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
116126
comment='use 32gb gpus')

0 commit comments

Comments
 (0)