File tree | Expand file tree | Collapse file tree | 2 files changed | +14 -2 lines changed
Expand file tree | Collapse file tree | 2 files changed | +14 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,11 @@ commands:
1212 FIFO=/tmp/dstack_job
1313 if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
1414 cd /root/nccl-tests/build
15- echo "${DSTACK_NODES_IPS}" > hostfile
15+ # Generate hostfile for mpirun
16+ : > hostfile
17+ for ip in ${DSTACK_NODES_IPS}; do
18+ echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
19+ done
1620 MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
1721 # Wait for other nodes
1822 while true; do
@@ -25,6 +29,8 @@ commands:
2529 # Run NCCL Tests
2630 ${MPIRUN} \
2731 -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
32+ --mca pml ^cm \
33+ --mca btl tcp,self \
2834 --mca btl_tcp_if_exclude lo,docker0 \
2935 --bind-to none \
3036 ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,11 @@ commands:
2323 FIFO=/tmp/dstack_job
2424 if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
2525 cd /root/nccl-tests/build
26- echo "${DSTACK_NODES_IPS}" > hostfile
26+ # Generate hostfile for mpirun
27+ : > hostfile
28+ for ip in ${DSTACK_NODES_IPS}; do
29+ echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
30+ done
2731 MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
2832 # Wait for other nodes
2933 while true; do
@@ -36,6 +40,8 @@ commands:
3640 # Run NCCL Tests
3741 ${MPIRUN} \
3842 -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
43+ --mca pml ^cm \
44+ --mca btl tcp,self \
3945 --mca btl_tcp_if_exclude lo,docker0 \
4046 --bind-to none \
4147 ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
You can’t perform that action at this time.
0 commit comments