File tree Expand file tree Collapse file tree 1 file changed +13
-5
lines changed
examples/new_project_templates/multi_node_examples Expand file tree Collapse file tree 1 file changed +13
-5
lines changed Original file line number Diff line number Diff line change @@ -115,11 +115,19 @@ def optimize_on_cluster(hyperparams):
115115 # set DDP master port
116116 cluster .add_command (f'export MASTER_PORT={ PORT } ' )
117117
118- # YOU MIGHT NEED THESE
119- # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
120- # cluster.add_command('export NCCL_DEBUG=INFO')
121- # cluster.add_command('export PYTHONFAULTHANDLER=1')
122- # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
118+ # OPTIONAL for debugging
119+ # without these flags, errors in your code will
120+ # appear to be NCCL errors
121+ cluster .add_command ('export NCCL_DEBUG=INFO' )
122+ cluster .add_command ('export PYTHONFAULTHANDLER=1' )
123+
124+ # depending on your cluster config, you probably want to restrict
125+ # which network interfaces NCCL may use (a leading ^ excludes those listed)
126+ # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
127+
128+ # depending on your cluster, you might need to load a recent
129+ # NCCL module (adjust the version string to what your cluster provides)
130+ # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
123131
124132 # run only on 32GB voltas
125133 cluster .add_slurm_cmd (cmd = 'constraint' , value = 'volta32gb' ,
You can’t perform that action at this time.
0 commit comments