#!/bin/bash
#
# Repeatedly launches the nccl-tests all_reduce_perf benchmark through srun to
# stress the GPUs and the RDMA network, appending the output to a log file.
#
# Usage: <script> [runs] [hostfile] [logfile]
#   runs      Number of times to launch the benchmark (default: 1). This is
#             different from the per-launch iteration count, which is the
#             fixed "-n 100" passed to all_reduce_perf below.
#   hostfile  Node list handed to srun --nodefile
#             (default: /tmp/ordered_hostfile_system_name). It is assumed to
#             be already ordered by rackId; slurm 23.02 and higher will order
#             it based on topology.
#   logfile   File the run number, date and benchmark output are appended to
#             (default: nccl_run_allreduce_srun.sh.log).
#
# Only `set -e` is enabled. `-u` is left off because the sourced mpivars.sh is
# outside this script's control and may reference unset variables, and
# `pipefail` is left off so that a failed metadata query is reported as an
# unknown shape instead of aborting the run.
set -e

# Prints a message to stderr and exits with the given status (default 1).
die() {
  printf '%s\n' "$1" >&2
  exit "${2:-1}"
}

# Prints the first existing mpivars.sh from the known OpenMPI install
# locations; returns non-zero when none exists. An unmatched glob stays
# literal and is rejected by the -f test.
find_mpivars() {
  local candidate
  for candidate in \
    /usr/mpi/gcc/openmpi-*/bin/mpivars.sh \
    /opt/openmpi-*/bin/mpivars.sh; do
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

max=${1:-1}
hostfile=${2:-/tmp/ordered_hostfile_system_name}
logfile=${3:-nccl_run_allreduce_srun.sh.log}

[[ "$max" =~ ^[0-9]+$ ]] \
  || die "Usage: ${0##*/} [runs] [hostfile] [logfile] (runs must be a non-negative integer)" 2
max=$((10#$max)) # force base 10 so a value such as 08 is not read as octal
[[ -r "$hostfile" ]] || die "Cannot read hostfile: $hostfile"

echo INPUTFILE
cat "$hostfile"

printf '%s\n' "$logfile"

# Everything below up to the loop is identical for every run, so it is done
# once instead of once per iteration.

# The explicit `||` keeps `set -e` from aborting before the message is shown.
mpivars_path=$(find_mpivars) || die "Could not find MPIPATH"

source "$mpivars_path"
printf '%s\n' "$mpivars_path"

USER=$(whoami)

first_node=$(head -n 1 "$hostfile")
[[ -n "$first_node" ]] || die "Hostfile is empty: $hostfile"

# Ask the instance metadata endpoint on the first node for its shape.
# jq -r strips the JSON quotes so the value can be compared as a plain string.
shape=$(ssh "$first_node" 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq -r .shape)

# Per-shape device lists; they stay empty for a shape that is not listed.
var_UCX_NET_DEVICES=""
var_NCCL_IB_HCA=""
case "$shape" in
  BM.GPU.B4.8 | BM.GPU.A100-v2.8)
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
    ;;
  BM.GPU4.8)
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
    ;;
  *)
    printf 'Warning: unrecognized shape "%s"; UCX_NET_DEVICES and NCCL_IB_HCA are left empty\n' \
      "$shape" >&2
    ;;
esac

export NCCL_DEBUG=WARN \
  OMPI_MCA_coll=^hcoll \
  RX_QUEUE_LEN=8192 \
  IB_RX_QUEUE_LEN=8192 \
  NCCL_IGNORE_CPU_AFFINITY=1 \
  NCCL_IB_SL=0 \
  NCCL_IB_TC=41 \
  NCCL_IB_QPS_PER_CONNECTION=4 \
  UCX_TLS=ud,self,sm \
  UCX_NET_DEVICES="${var_UCX_NET_DEVICES}" \
  HCOLL_ENABLE_MCAST_ALL=0 \
  coll_hcoll_enable=0 \
  NCCL_IB_GID_INDEX=3 \
  NCCL_ALGO=Ring \
  NCCL_IB_HCA="${var_NCCL_IB_HCA}"

# Value for all_reduce_perf's -i option: 9 GiB.
step_bytes=$((1024 * 1024 * 1024 * 9))

for ((run = 1; run <= max; run++)); do
  # Mark the start of each run on the console and in the log.
  printf '%s\n' "$run"
  printf '%s\n' "$run" >>"$logfile"
  date >>"$logfile"

  srun --mpi=pmix_v3 --nodefile="$hostfile" --gpus-per-node=8 --ntasks-per-node=8 \
    "/home/$USER/nccl-tests/build/all_reduce_perf" -b1G -e10G -i"$step_bytes" -n 100 >>"$logfile"

  # Show the end of the log, which includes the output of the run just finished.
  tail -n 32 "$logfile"
done