Skip to content

Commit 7ce6a2b

Browse files
authored
bugfix: resolve hang when training with dist_train.sh; support configurable tcp_port (#784)
1 parent 274c90c commit 7ce6a2b

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

pcdet/utils/common_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,11 @@ def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
161161
def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
162162
if mp.get_start_method(allow_none=True) is None:
163163
mp.set_start_method('spawn')
164-
164+
os.environ['MASTER_PORT'] = str(tcp_port)
165+
os.environ['MASTER_ADDR'] = 'localhost'
165166
num_gpus = torch.cuda.device_count()
166167
torch.cuda.set_device(local_rank % num_gpus)
168+
167169
dist.init_process_group(
168170
backend=backend,
169171
# init_method='tcp://127.0.0.1:%d' % tcp_port,

0 commit comments

Comments
 (0)