Skip to content

Commit 57adf1e

Browse files
Add Tuner example
1 parent 9ba7480 commit 57adf1e

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-slurm
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
#
# Slurm batch job that runs the NCCL all_reduce_perf benchmark across the
# allocated nodes with the OCI NCCL tuner plugin loaded.
#
# Steps:
#   1. create a per-job working directory under /nfs/scratch
#   2. write the allocated hostnames to a hostfile and run
#      node_ordering_by_rack.py on it (the ordered hostfile / rankfile read
#      below are expected to be produced by that script in the working dir)
#   3. load the Open MPI environment (mpivars.sh)
#   4. pick RDMA device lists from the instance shape and a tuner plugin
#      build from the installed NCCL version
#   5. launch all_reduce_perf through mpirun using the rankfile
#
# NOTE: the #SBATCH directives must stay above the first executable line,
# otherwise sbatch stops honouring them.

export PMI_DEBUG=1

# Print a message on stderr and abort the job with a failing status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# --- per-job working directory ----------------------------------------------
job_dir="/nfs/scratch/${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"
mkdir -p -- "$job_dir" || die "Cannot create ${job_dir}"
cd -- "$job_dir" || die "Cannot cd to ${job_dir}"

MACHINEFILE="hostfile"
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
ORDEREDRANKMACHINEFILE="rankfile_system_name"

scontrol show hostnames "$SLURM_JOB_NODELIST" > "$MACHINEFILE"
echo MACHINEFILE
cat "$MACHINEFILE"

# --- rack-aware ordering -----------------------------------------------------
# The helper lives in the default user's home, which depends on the distro.
source /etc/os-release
case "${ID:-}" in
  ol | centos)
    python3 /home/opc/node_ordering_by_rack.py --input_file "$MACHINEFILE" > /dev/null
    ;;
  debian | ubuntu)
    python3 /home/ubuntu/node_ordering_by_rack.py --input_file "$MACHINEFILE" > /dev/null
    ;;
  *)
    printf 'warning: unrecognised distribution "%s"; node ordering was not run\n' "${ID:-}" >&2
    ;;
esac

echo ORDEREDMACHINEFILE
cat "$ORDEREDMACHINEFILE"
echo ORDEREDRANKMACHINEFILE
cat "$ORDEREDRANKMACHINEFILE"

# mpirun cannot start without the rankfile, so fail early with a clear message.
[[ -f "$ORDEREDRANKMACHINEFILE" ]] || die "Rankfile ${ORDEREDRANKMACHINEFILE} was not generated"

# --- Open MPI environment ----------------------------------------------------
# Use the first mpivars.sh found; glob directly instead of parsing `ls`, so an
# unmatched pattern or several installed versions do not break `source`.
mpivars_path=""
for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh /opt/openmpi-*/bin/mpivars.sh; do
  if [[ -f "$candidate" ]]; then
    mpivars_path=$candidate
    break
  fi
done
[[ -n "$mpivars_path" ]] || die "Could not find MPIPATH"

source "$mpivars_path"

export NCCL_DEBUG=WARN

#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
# no need to pass: -x SLURM_JOB_NODELIST=$host_list

# --- shape-specific RDMA devices ---------------------------------------------
# Ask the instance metadata service for the shape; `jq -r` prints the bare
# string so it can be matched without escaped quotes.
shape=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .shape)

var_UCX_NET_DEVICES=""
var_NCCL_IB_HCA=""
case "$shape" in
  BM.GPU.B4.8 | BM.GPU.A100-v2.8)
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
    ;;
  BM.GPU4.8)
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
    ;;
  *)
    # Same as before for unknown shapes: continue with empty device lists.
    printf 'warning: no RDMA device mapping for shape "%s"\n' "$shape" >&2
    ;;
esac

# --- tuner plugin matching the installed NCCL --------------------------------
# The sed expression keeps what follows the last ".so." in the ldconfig line,
# e.g. "2.20.5" (major.minor.patch).
NCCL_version=$(sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//')
IFS=. read -r _ nccl_minor _ <<< "$NCCL_version"

# The comparison is numeric and on the MINOR field. The previous
# `[ ${arr_NCCL[2]} > 20 ]` was a redirection to a file named "20" (always
# true for a non-empty value) and looked at the patch number.
# NOTE(review): assumes the 20/21 threshold refers to the NCCL minor version
# (2.21 and later use the 2.0.1 tuner) - confirm.
if [[ "$nccl_minor" =~ ^[0-9]+$ ]] && ((10#$nccl_minor > 20)); then
  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1
else
  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2
fi

# --- benchmark ---------------------------------------------------------------
mpirun_args=(
  --mca pml ucx
  --bind-to numa
  --mca coll '^hcoll'
  -x NCCL_DEBUG=WARN
  -x NCCL_IB_SL=0
  -x NCCL_IB_TC=41
  -x NCCL_IB_QPS_PER_CONNECTION=4
  -x UCX_TLS=ud,self,sm
  -x "UCX_NET_DEVICES=${var_UCX_NET_DEVICES}"
  -x HCOLL_ENABLE_MCAST_ALL=0
  -x coll_hcoll_enable=0
  -x NCCL_IB_GID_INDEX=3
  -x "NCCL_TUNER_PLUGIN=${tuner_path}"
  -x "NCCL_IB_HCA=${var_NCCL_IB_HCA}"
  --np "$((SLURM_NNODES * SLURM_NTASKS_PER_NODE))"
  --rankfile "$ORDEREDRANKMACHINEFILE"
)

mpirun "${mpirun_args[@]}" \
  /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
87+
88+
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/bin/bash
set -e
#
# Repeatedly runs the NCCL all_reduce_perf benchmark to stress the GPUs and
# the RDMA network, appending every run to nccl_run_allreduce.sh.log.
#
# Arguments:
#   $1  number of times to run the nccl test. This is different from the
#       -n iterations parameter of nccl allreduce, which is set below
#       through $iter.
#   $2  (optional) hostfile, assumed to be already ordered by rackId;
#       defaults to /tmp/ordered_hostfile_system_name
#   $3  (optional) number of GPUs (MPI ranks) to use; has to be a multiple
#       of 8. If not passed, 8 per host in the ordered hostfile are used.

# Print a message on stderr and abort with a failing status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

max=$1
# Without a valid count the loop below would silently run zero times.
[[ "$max" =~ ^[0-9]+$ ]] || die "Usage: $0 <number_of_runs> [hostfile] [np]"

if [ -n "$2" ]; then
  hostfile=$2
else
  #hostfile="/home/opc/hostfile.tcp"
  #hostfile="/etc/opt/oci-hpc/hostfile.tcp"
  hostfile="/tmp/ordered_hostfile_system_name"
fi

ORDEREDMACHINEFILE="ordered_hostfile_system_name"
ORDEREDRANKMACHINEFILE="rankfile_system_name"
echo INPUTFILE
cat "$hostfile"

# Generate the rack-aware ordered host file. The helper lives in the default
# user's home, which depends on the distro.
source /etc/os-release
case "${ID:-}" in
  ol | centos)
    python3 /home/opc/node_ordering_by_rack.py --input_file "$hostfile" > /dev/null
    ;;
  debian | ubuntu)
    python3 /home/ubuntu/node_ordering_by_rack.py --input_file "$hostfile" > /dev/null
    ;;
  *)
    printf 'warning: unrecognised distribution "%s"; node ordering was not run\n' "${ID:-}" >&2
    ;;
esac

# From here on use the ordered files (read from the current directory).
hostfile=$ORDEREDMACHINEFILE
rankfile=$ORDEREDRANKMACHINEFILE

echo ORDEREDMACHINEFILE
cat "$ORDEREDMACHINEFILE"
echo ORDEREDRANKMACHINEFILE
cat "$ORDEREDRANKMACHINEFILE"

# Rank count: explicit third argument, else 8 per line of the ordered hostfile.
if [ -n "$3" ]; then
  np=$3
else
  np=$(($(wc -l < "$hostfile") * 8))
fi

logfile="nccl_run_allreduce.sh.log"

# all_reduce_perf iterations per run (-n).
iter=20

# ---------------------------------------------------------------------------
# Everything below up to the loop does not change between runs, so it is done
# once instead of on every iteration.
# ---------------------------------------------------------------------------

# Open MPI environment: use the first mpivars.sh found. The lookup happens
# BEFORE sourcing (the old code sourced first and checked afterwards), and
# globbing replaces parsing `ls`. The /opt location is accepted as well.
mpivars_path=""
for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh /opt/openmpi-*/bin/mpivars.sh; do
  if [[ -f "$candidate" ]]; then
    mpivars_path=$candidate
    break
  fi
done
[[ -n "$mpivars_path" ]] || die "Could not find MPIPATH"
source "$mpivars_path"

# Shape of the first node selects the RDMA device lists; `jq -r` prints the
# bare string so it can be matched without escaped quotes.
first_node=$(head -n 1 "$hostfile")
shape=$(ssh "$first_node" 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq -r .shape)

var_UCX_NET_DEVICES=""
var_NCCL_IB_HCA=""
case "$shape" in
  BM.GPU.B4.8 | BM.GPU.A100-v2.8)
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
    ;;
  BM.GPU4.8)
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
    ;;
  *)
    # Same as before for unknown shapes: continue with empty device lists.
    printf 'warning: no RDMA device mapping for shape "%s"\n' "$shape" >&2
    ;;
esac

# Tuner plugin build matching the installed NCCL. The sed expression keeps
# what follows the last ".so." in the ldconfig line, e.g. "2.20.5".
NCCL_version=$(sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//')
IFS=. read -r _ nccl_minor _ <<< "$NCCL_version"

# Numeric comparison on the MINOR field. The previous `[ ${arr_NCCL[2]} < 21 ]`
# was an input redirection from a file named "21" and looked at the patch
# number, so the 1.0.2 branch could never be taken.
# NOTE(review): assumes the 20/21 threshold refers to the NCCL minor version
# (2.21 and later use the 2.0.1 tuner) - confirm.
if [[ "$nccl_minor" =~ ^[0-9]+$ ]] && ((10#$nccl_minor > 20)); then
  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1
else
  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2
fi
# NOTE(review): tuner_path is not passed to mpirun in this script, which
# forces NCCL_ALGO=Ring instead. Add `-x "NCCL_TUNER_PLUGIN=${tuner_path}"`
# below if the tuner plugin is meant to be used here - confirm intent.

mpirun_args=(
  --mca pml ucx
  --bind-to numa
  --mca coll '^hcoll'
  -x NCCL_DEBUG=WARN
  -x NCCL_IB_SL=0
  -x NCCL_IB_TC=41
  -x NCCL_IB_QPS_PER_CONNECTION=4
  -x UCX_TLS=ud,self,sm
  -x "UCX_NET_DEVICES=${var_UCX_NET_DEVICES}"
  -x HCOLL_ENABLE_MCAST_ALL=0
  -x coll_hcoll_enable=0
  -x NCCL_IB_GID_INDEX=3
  -x NCCL_ALGO=Ring
  -x "NCCL_IB_HCA=${var_NCCL_IB_HCA}"
  --np "$np"
  --rankfile "$rankfile"
)

for ((x = 1; x <= max; x++)); do

  # Run counter on the console and, with a timestamp, in the log.
  echo "$x"
  echo "$x" >> "$logfile"
  date >> "$logfile"

  # final version
  mpirun "${mpirun_args[@]}" \
    /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n "$iter" >> "$logfile"

  # Show the end of the log, which holds the output of the run just finished.
  tail -n 32 "$logfile"

done
102+
103+

0 commit comments

Comments
 (0)