
Commit 29f6cd6: verify ddp training

1 parent 852dc56

File tree

3 files changed: 11 additions & 3 deletions

ddp_script.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+export CUDA_VISIBLE_DEVICES=0,...,N_GPUS
+export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=^docker0,lo
+export MASTER_ADDR="MASTER_IP"
+export MASTER_PORT="MASTER_PORT"
+
+python3 src/main.py -t -e -rm_API -c CONFIG_PATH -DDP -n NUM_NODES -nr CURRENT_NODES -eval_type EVAL_TYPE ...
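
These exports feed the launch in src/main.py: each node spawns one worker per GPU made visible by CUDA_VISIBLE_DEVICES, and the -n/-nr flags give the node count and this node's index. A minimal sketch of how those values typically map to DDP ranks; the concrete numbers and variable names below are illustrative assumptions, not taken from the repo:

import torch

# One worker per visible GPU on this node; fall back to 1 so the
# sketch also runs on a CPU-only machine.
gpus_per_node = torch.cuda.device_count() or 1
num_nodes, node_rank = 2, 0                     # would come from -n and -nr
world_size = num_nodes * gpus_per_node          # total ranks across the cluster

for local_rank in range(gpus_per_node):
    global_rank = node_rank * gpus_per_node + local_rank
    print(f"local rank {local_rank} -> global rank {global_rank} of {world_size}")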

src/main.py

Lines changed: 0 additions & 2 deletions
@@ -123,8 +123,6 @@ def main():

     if train_config['distributed_data_parallel'] and world_size > 1:
         print("Train the models through DistributedDataParallel (DDP) mode.")
-        os.environ['MASTER_ADDR'] = 'localhost'
-        os.environ['MASTER_PORT'] = '2222'
         mp.spawn(prepare_train_eval, nprocs=gpus_per_node, args=(gpus_per_node, world_size, run_name,
                                                                  train_config, model_config, hdf5_path_train))
     else:
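
With the hardcoded rendezvous address removed here, the launcher now depends on MASTER_ADDR and MASTER_PORT being exported beforehand (as ddp_script.sh does above). A fail-fast guard one could place before mp.spawn; this check is a sketch, not part of the commit:

import os

def require_ddp_env():
    # Surface a clear error immediately instead of hanging later inside
    # init_process_group when the rendezvous address is missing.
    for var in ("MASTER_ADDR", "MASTER_PORT"):
        if var not in os.environ:
            raise RuntimeError(f"{var} is not set; export it (see ddp_script.sh) "
                               "before launching DDP training.")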

src/utils/misc.py

Lines changed: 4 additions & 1 deletion
@@ -133,7 +133,10 @@ def setup(rank, world_size, backend="nccl"):
         )
     else:
         # initialize the process group
-        dist.init_process_group(backend, rank=rank, world_size=world_size)
+        dist.init_process_group(backend,
+                                init_method="tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']),
+                                rank=rank,
+                                world_size=world_size)


 def cleanup():
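
For reference, a self-contained sketch of the TCP rendezvous pattern the updated setup() uses: two local workers initialize a process group via tcp://MASTER_ADDR:MASTER_PORT and run one all_reduce. The gloo backend is used here so the sketch runs without GPUs; the commit itself defaults to nccl, and the demo address values are assumptions for a single-machine run:

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, world_size):
    # Mirrors the init_process_group call added in setup(), but with gloo.
    dist.init_process_group(
        "gloo",
        init_method="tcp://%s:%s" % (os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]),
        rank=rank,
        world_size=world_size,
    )
    t = torch.ones(1) * (rank + 1)
    dist.all_reduce(t)                      # default op is SUM: 1 + 2 = 3
    print(f"rank {rank}: all_reduce -> {t.item()}")
    dist.destroy_process_group()

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")   # single-machine demo values
    os.environ.setdefault("MASTER_PORT", "2222")
    mp.spawn(worker, nprocs=2, args=(2,))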
