Skip to content

Commit 9d0b83f

Browse files
authored
[Examples] Update nccl-tests (#2451)
* Generate hostfile with `slots` option
* Add MCA parameters from https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-test
1 parent af7887e commit 9d0b83f

File tree

2 files changed: +14 -2 lines changed

2 files changed: +14 -2 lines changed

examples/misc/nccl-tests/.dstack.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@ commands:
1212
FIFO=/tmp/dstack_job
1313
if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
1414
cd /root/nccl-tests/build
15-
echo "${DSTACK_NODES_IPS}" > hostfile
15+
# Generate hostfile for mpirun
16+
: > hostfile
17+
for ip in ${DSTACK_NODES_IPS}; do
18+
echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
19+
done
1620
MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
1721
# Wait for other nodes
1822
while true; do
@@ -25,6 +29,8 @@ commands:
2529
# Run NCCL Tests
2630
${MPIRUN} \
2731
-n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
32+
--mca pml ^cm \
33+
--mca btl tcp,self \
2834
--mca btl_tcp_if_exclude lo,docker0 \
2935
--bind-to none \
3036
./all_reduce_perf -b 8 -e 8G -f 2 -g 1

examples/misc/nccl-tests/README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ commands:
2323
FIFO=/tmp/dstack_job
2424
if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
2525
cd /root/nccl-tests/build
26-
echo "${DSTACK_NODES_IPS}" > hostfile
26+
# Generate hostfile for mpirun
27+
: > hostfile
28+
for ip in ${DSTACK_NODES_IPS}; do
29+
echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
30+
done
2731
MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
2832
# Wait for other nodes
2933
while true; do
@@ -36,6 +40,8 @@ commands:
3640
# Run NCCL Tests
3741
${MPIRUN} \
3842
-n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
43+
--mca pml ^cm \
44+
--mca btl tcp,self \
3945
--mca btl_tcp_if_exclude lo,docker0 \
4046
--bind-to none \
4147
./all_reduce_perf -b 8 -e 8G -f 2 -g 1

0 commit comments

Comments
 (0)