
Commit 280fa3c

Add OCI tuner
1 parent 79c3cdb commit 280fa3c

File tree

8 files changed: +180 -10 lines changed


playbooks/new_nodes.yml

Lines changed: 3 additions & 3 deletions
@@ -54,9 +54,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool

 - hosts: controller,slurm_backup,login,compute
   become: true
@@ -174,6 +171,9 @@
       when: enroot|default(true)|bool
     - include_role:
         name: tuned
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool

 - hosts: compute
   tasks:

playbooks/resize_add.yml

Lines changed: 3 additions & 3 deletions
@@ -52,9 +52,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool

 - hosts: controller,slurm_backup,login,compute
   become: true
@@ -176,6 +173,9 @@
       when: enroot|default(true)|bool
     - include_role:
         name: tuned
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool

 - hosts: all
   become: true
Two binary files are not shown (presumably the libnccl-ocituner.so.1.0.1_OL and libnccl-ocituner.so.1.0.1_ubuntu plugin binaries copied by the nccl-conf role below; one is listed at 161 KB).

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 21 additions & 1 deletion
@@ -33,4 +33,24 @@
     owner: root
     group: root
     mode: '0644'
-  when: shape_nccl.stdout == '"BM.GPU4.8"'
+  when: shape_nccl.stdout == '"BM.GPU4.8"'
+
+- name: copy libnccl-ocituner for OL
+  become: true
+  copy:
+    src: libnccl-ocituner.so.1.0.1_OL
+    dest: /home/opc/libnccl-ocituner.so.1.0.1
+    owner: opc
+    group: privilege
+    mode: '0775'
+  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_distribution == 'OracleLinux'
+
+- name: copy libnccl-ocituner for Ubuntu
+  become: true
+  copy:
+    src: libnccl-ocituner.so.1.0.1_ubuntu
+    dest: /home/ubuntu/libnccl-ocituner.so.1.0.1
+    owner: ubuntu
+    group: privilege
+    mode: '0775'
+  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_os_family == 'Debian'
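
The role above only stages the tuner library in the user's home directory; NCCL loads it at run time through the NCCL_TUNER_PLUGIN environment variable, which the new run scripts later in this commit set on the mpirun command line. As a rough sketch (not part of the commit), a quick single-node check on an Oracle Linux node could look like the following, reusing the paths from the tasks above; NCCL_DEBUG=INFO is assumed here only to surface whether the plugin was picked up:

    # Sketch only: single-node sanity check on an Oracle Linux node (paths per the tasks above).
    mpirun -np 8 \
           -x NCCL_TUNER_PLUGIN=/home/opc/libnccl-ocituner.so.1.0.1 \
           -x NCCL_DEBUG=INFO \
           /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 1G -g 1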

playbooks/site.yml

Lines changed: 3 additions & 3 deletions
@@ -64,9 +64,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool

 - hosts: controller
   become: true
@@ -264,6 +261,9 @@
     - include_role:
         name: hyperthreading
       when: not hyperthreading|default(false)|bool
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool

 - hosts: all
   tasks:
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+#!/bin/bash
+#SBATCH --job-name=nccl-allreduce-slurm
+#SBATCH --nodes=2
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --exclusive
+export PMI_DEBUG=1
+
+
+cd /nfs/cluster
+mkdir $SLURM_JOB_ID
+cd $SLURM_JOB_ID
+
+MACHINEFILE="hostfile"
+ORDEREDMACHINEFILE="ordered_hostfile_system_name"
+ORDEREDRANKMACHINEFILE="rankfile_system_name"
+
+scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
+echo MACHINEFILE
+cat $MACHINEFILE
+
+source /etc/os-release
+if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
+    python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
+    python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+fi
+
+
+echo ORDEREDMACHINEFILE
+cat $ORDEREDMACHINEFILE
+echo ORDEREDRANKMACHINEFILE
+cat $ORDEREDRANKMACHINEFILE
+
+mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
+
+if [[ "$mpivars_path" == "" ]]; then
+    mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
+fi
+
+if [[ "$mpivars_path" == "" ]]; then
+    echo "Could not find MPIPATH"; exit; fi
+
+source $mpivars_path
+
+shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
+if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
+then
+    var_UCX_NET_DEVICES=mlx5_0:1
+elif [ $shape == \"BM.GPU4.8\" ]
+then
+    var_UCX_NET_DEVICES=mlx5_4:1
+fi
+
+mpirun --mca pml ucx \
+       --bind-to numa \
+       --mca coll ^hcoll \
+       -x UCX_TLS=ud,self,sm \
+       -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
+       -x HCOLL_ENABLE_MCAST_ALL=0 \
+       -x coll_hcoll_enable=0 \
+       -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \
+       --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
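
Note that the Slurm script above references $homedirectory in the NCCL_TUNER_PLUGIN path but never sets it, so the value has to come from the submission environment (sbatch exports the caller's environment by default). A hedged usage sketch, with a placeholder file name since the captured diff does not show the new file's path:

    # Sketch only: the script name below is a placeholder.
    export homedirectory=$HOME      # consumed by the -x NCCL_TUNER_PLUGIN=... line in the job script
    sbatch nccl_run_allreduce_tuner.sbatch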
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+#!/bin/bash
+set -e
+
+# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from the -n iterations parameter of NCCL allreduce, which is set below using $iter
+max=$1
+
+# This assumes the hostfile passed in is already ordered by rackId
+if [ -n "$2" ]; then
+    hostfile=$2
+else
+    hostfile="/tmp/ordered_hostfile_system_name"
+fi
+
+ORDEREDMACHINEFILE="ordered_hostfile_system_name"
+ORDEREDRANKMACHINEFILE="rankfile_system_name"
+echo INPUTFILE
+cat $hostfile
+
+# will generate a rack-aware ordered host file
+source /etc/os-release
+if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
+    python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
+    homedirectory=/home/opc
+elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
+    python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
+    homedirectory=/home/ubuntu
+fi
+
+hostfile=$ORDEREDMACHINEFILE
+rankfile=$ORDEREDRANKMACHINEFILE
+
+echo ORDEREDMACHINEFILE
+cat $ORDEREDMACHINEFILE
+echo ORDEREDRANKMACHINEFILE
+cat $ORDEREDRANKMACHINEFILE
+
+# The number of GPUs to use for the test. Has to be a multiple of 8. If not passed, all GPUs will be used.
+if [ -n "$3" ]; then
+    np=$3
+else
+    np=$((`less $hostfile | wc -l` * 8 ))
+fi
+
+logfile="nccl_run_allreduce.sh.log"
+
+for x in $(seq 1 1 $max)
+do
+
+    echo $x
+    echo $x >> $logfile
+    date >> $logfile
+
+    rankfile=$rankfile; np=$np ; iter=20;
+
+    mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
+    source $mpivars_path
+
+    if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi
+
+    first_node=`head $hostfile -n 1`
+    shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
+    if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
+    then
+        var_UCX_NET_DEVICES=mlx5_0:1
+    elif [ $shape == \"BM.GPU4.8\" ]
+    then
+        var_UCX_NET_DEVICES=mlx5_4:1
+    fi
+
+    # final version
+    # all NCCL parameters are at /etc/nccl.conf on each compute node.
+    mpirun --mca pml ucx \
+           --bind-to numa \
+           --mca coll ^hcoll \
+           -x UCX_TLS=ud,self,sm \
+           -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
+           -x HCOLL_ENABLE_MCAST_ALL=0 \
+           -x coll_hcoll_enable=0 \
+           -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \
+           --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile
+
+    tail -n 32 $logfile
+
+
+done
+
+
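
For the standalone loop script above, the first argument is the number of test repetitions, the optional second argument is a rack-ordered hostfile (default /tmp/ordered_hostfile_system_name), and the optional third argument is the total rank/GPU count, which must be a multiple of 8. A usage sketch, again with a placeholder file name because the captured diff does not show the new script's path:

    ./nccl_run_allreduce_tuner.sh 10                                        # 10 repetitions, default hostfile
    ./nccl_run_allreduce_tuner.sh 5 /tmp/ordered_hostfile_system_name 16    # explicit hostfile, 16 ranks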
