File tree Expand file tree Collapse file tree 1 file changed +10
-0
lines changed
examples/new_project_templates/multi_node_examples Expand file tree Collapse file tree 1 file changed +10
-0
lines changed Original file line number Diff line number Diff line change 1212
13 13 from examples.new_project_templates.lightning_module_template import LightningTemplateModel
14 14
15+ PORT = np.random.randint(12000, 20000, 1)[0]
15 16 SEED = 2334
16 17 torch.manual_seed(SEED)
17 18 np.random.seed(SEED)
@@ -111,6 +112,15 @@ def optimize_on_cluster(hyperparams):
111 112 # any modules for code to run in env
112 113 cluster.add_command(f'source activate {hyperparams.conda_env}')
113 114
115+ # set DDP master port
116+ cluster.add_command(f'export MASTER_PORT={PORT}')
117+
118+ # YOU MIGHT NEED THESE
119+ # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
120+ # cluster.add_command('export NCCL_DEBUG=INFO')
121+ # cluster.add_command('export PYTHONFAULTHANDLER=1')
122+ # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
123+
114 124 # run only on 32GB voltas
115 125 cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
116 126 comment='use 32gb gpus')
You can’t perform that action at this time.
0 commit comments