Skip to content

Commit 95645e7

Browse files
committed
Updated code so that the Ansible stdout comparison matches the output shape. Added no-NCCL-param scripts.
1 parent 480407f commit 95645e7

File tree

10 files changed

+279
-13
lines changed

10 files changed

+279
-13
lines changed

playbooks/roles/nccl-conf/files/a100_b4.8

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ NCCL_IB_SL=0
44
NCCL_IB_TC=41
55
NCCL_IB_QPS_PER_CONNECTION=4
66
NCCL_IB_GID_INDEX=3
7-
NCCL_ALGO=Ring
8-
NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
7+
NCCL_IB_HCA==mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12

playbooks/roles/nccl-conf/files/bm.gpu4.8

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ NCCL_IB_SL=0
44
NCCL_IB_TC=41
55
NCCL_IB_QPS_PER_CONNECTION=4
66
NCCL_IB_GID_INDEX=3
7-
NCCL_ALGO=Ring
8-
NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
7+
NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17

playbooks/roles/nccl-conf/files/h100

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ NCCL_IB_SL=0
1010
NCCL_IB_TIMEOUT=22
1111
NCCL_NET_PLUGIN=none
1212
NCCL_SOCKET_IFNAME=eth0
13-
NCCL_ALGO=auto
1413
NCCL_IGNORE_CPU_AFFINITY=1
15-
NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17"
14+
NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
1615
NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
owner: root
1414
group: root
1515
mode: '0644'
16-
when: shape_nccl.stdout == "BM.GPU.H100.8"
16+
when: shape_nccl.stdout == '"BM.GPU.H100.8"'
1717

1818
- name: copy nccl.conf for BM.GPU.B4.8 and A100-v2.8
1919
become: true
@@ -23,7 +23,7 @@
2323
owner: root
2424
group: root
2525
mode: '0644'
26-
when: shape_nccl.stdout == "BM.GPU.B4.8" or shape_nccl.stdout == "BM.GPU.A100-v2.8"
26+
when: shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"'
2727

2828
- name: copy nccl.conf for BM.GPU4.8
2929
become: true
@@ -33,4 +33,4 @@
3333
owner: root
3434
group: root
3535
mode: '0644'
36-
when: shape_nccl.stdout == "BM.GPU4.8"
36+
when: shape_nccl.stdout == '"BM.GPU4.8"'

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ fi
6161
-x RX_QUEUE_LEN=8192 \
6262
-x IB_RX_QUEUE_LEN=8192 \
6363
-x NCCL_SOCKET_IFNAME=eth0 \
64-
-x NCCL_ALGO=auto \
6564
-x NCCL_IGNORE_CPU_AFFINITY=1 \
6665
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
6766
-x NCCL_TOPO_FILE=~/H100-topology.xml \
68-
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
67+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,10 @@ do
7575
-x RX_QUEUE_LEN=8192 \
7676
-x IB_RX_QUEUE_LEN=8192 \
7777
-x NCCL_SOCKET_IFNAME=eth0 \
78-
-x NCCL_ALGO=auto \
7978
-x NCCL_IGNORE_CPU_AFFINITY=1 \
8079
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
8180
-x NCCL_TOPO_FILE=~/H100-topology.xml \
82-
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile
81+
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
8382

8483
tail -n 32 $logfile
8584

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=nccl-allreduce-slurm
3+
#SBATCH --nodes=2
4+
#SBATCH --gpus-per-node=8
5+
#SBATCH --ntasks-per-node=8
6+
#SBATCH --exclusive
7+
export PMI_DEBUG=1
8+
9+
10+
cd /nfs/cluster
11+
mkdir $SLURM_JOB_ID
12+
cd $SLURM_JOB_ID
13+
14+
MACHINEFILE="hostfile"
15+
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
16+
ORDEREDRANKMACHINEFILE="rankfile_system_name"
17+
18+
scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
19+
echo MACHINEFILE
20+
cat $MACHINEFILE
21+
22+
source /etc/os-release
23+
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
24+
python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
25+
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
26+
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
27+
fi
28+
29+
30+
echo ORDEREDMACHINEFILE
31+
cat $ORDEREDMACHINEFILE
32+
echo ORDEREDRANKMACHINEFILE
33+
cat $ORDEREDRANKMACHINEFILE
34+
35+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
36+
37+
if [[ "$mpivars_path" == "" ]]; then
38+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
39+
fi
40+
41+
if [[ "$mpivars_path" == "" ]]; then
42+
echo "Could not find MPIPATH"; exit; fi
43+
44+
source $mpivars_path
45+
46+
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
47+
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
48+
then
49+
var_UCX_NET_DEVICES=mlx5_0:1
50+
elif [ $shape == \"BM.GPU4.8\" ]
51+
then
52+
var_UCX_NET_DEVICES=mlx5_4:1
53+
fi
54+
55+
mpirun --mca pml ucx \
56+
--bind-to numa \
57+
--mca coll ^hcoll \
58+
-x UCX_TLS=ud,self,sm \
59+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
60+
-x HCOLL_ENABLE_MCAST_ALL=0 \
61+
-x coll_hcoll_enable=0 \
62+
-x NCCL_ALGO=Ring \
63+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from the -n iterations parameter of nccl allreduce, which is set below using $iter
5+
max=$1
6+
7+
# This assumes the hostfile passed is already ordered based on their rackId
8+
if [ -n "$2" ]; then
9+
hostfile=$2
10+
else
11+
hostfile="/tmp/ordered_hostfile_system_name"
12+
fi
13+
14+
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
15+
ORDEREDRANKMACHINEFILE="rankfile_system_name"
16+
echo INPUTFILE
17+
cat $hostfile
18+
19+
# will generate rack-aware ordered host file
20+
source /etc/os-release
21+
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
22+
python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
23+
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
24+
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
25+
fi
26+
27+
hostfile=$ORDEREDMACHINEFILE
28+
rankfile=$ORDEREDRANKMACHINEFILE
29+
30+
echo ORDEREDMACHINEFILE
31+
cat $ORDEREDMACHINEFILE
32+
echo ORDEREDRANKMACHINEFILE
33+
cat $ORDEREDRANKMACHINEFILE
34+
35+
# The number of GPUs to use for the test. Has to be a multiple of 8. If not passed, all GPUs will be used.
36+
if [ -n "$3" ]; then
37+
np=$3
38+
else
39+
np=$((`less $hostfile | wc -l` * 8 ))
40+
fi
41+
42+
logfile="nccl_run_allreduce.sh.log"
43+
44+
for x in $(seq 1 1 $max)
45+
do
46+
47+
echo $x
48+
echo $x >> $logfile
49+
date >> $logfile
50+
51+
rankfile=$rankfile; np=$np ; iter=20;
52+
53+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
54+
source $mpivars_path
55+
56+
if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi
57+
58+
first_node=`head $hostfile -n 1`
59+
shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
60+
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
61+
then
62+
var_UCX_NET_DEVICES=mlx5_0:1
63+
elif [ $shape == \"BM.GPU4.8\" ]
64+
then
65+
var_UCX_NET_DEVICES=mlx5_4:1
66+
fi
67+
68+
# final version
69+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
70+
mpirun --mca pml ucx \
71+
--bind-to numa \
72+
--mca coll ^hcoll \
73+
-x UCX_TLS=ud,self,sm \
74+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
75+
-x HCOLL_ENABLE_MCAST_ALL=0 \
76+
-x coll_hcoll_enable=0 \
77+
-x NCCL_ALGO=Ring \
78+
--np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile
79+
80+
tail -n 32 $logfile
81+
82+
83+
done
84+
85+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=nccl-allreduce-slurm
3+
#SBATCH --nodes=2
4+
#SBATCH --gpus-per-node=8
5+
#SBATCH --ntasks-per-node=8
6+
#SBATCH --exclusive
7+
export PMI_DEBUG=1
8+
9+
10+
cd /nfs/cluster
11+
mkdir $SLURM_JOB_ID
12+
cd $SLURM_JOB_ID
13+
14+
MACHINEFILE="hostfile"
15+
16+
scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
17+
echo MACHINEFILE
18+
cat $MACHINEFILE
19+
20+
source /etc/os-release
21+
22+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
23+
24+
if [[ "$mpivars_path" == "" ]]; then
25+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
26+
fi
27+
28+
if [[ "$mpivars_path" == "" ]]; then
29+
echo "Could not find MPIPATH"; exit; fi
30+
31+
source $mpivars_path
32+
33+
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
34+
if [ $shape == \"BM.GPU.H100.8\" ]
35+
then
36+
var_UCX_NET_DEVICES=eth0
37+
else
38+
echo "Use the appropriate nccl test run script for non H100 nodes"
39+
fi
40+
41+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
42+
mpirun --mca pml ucx \
43+
--bind-to numa \
44+
-npernode 8 \
45+
--mca coll ^hcoll \
46+
-x HCOLL_ENABLE_MCAST_ALL=0 \
47+
-x coll_hcoll_enable=0 \
48+
-x UCX_TLS=tcp \
49+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
50+
-x RX_QUEUE_LEN=8192 \
51+
-x IB_RX_QUEUE_LEN=8192 \
52+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# number of times to run the nccl test to stress the GPUs and RDMA network.
5+
max=$1
6+
7+
# This assumes the hostfile passed is already ordered based on their rackId
8+
if [ -n "$2" ]; then
9+
hostfile=$2
10+
else
11+
hostfile="/etc/opt/oci-hpc/hostfile.tcp"
12+
fi
13+
14+
echo INPUTFILE
15+
cat $hostfile
16+
17+
# The number of GPUs to use for the test. Has to be a multiple of 8. If not passed, all GPUs will be used.
18+
if [ -n "$3" ]; then
19+
np=$3
20+
else
21+
np=$((`less $hostfile | wc -l` * 8 ))
22+
fi
23+
24+
logfile="nccl_run_allreduce.sh.log"
25+
26+
for x in $(seq 1 1 $max)
27+
do
28+
29+
echo $x
30+
echo $x >> $logfile
31+
date >> $logfile
32+
33+
hostfile=$hostfile; np=$np;
34+
35+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
36+
37+
if [[ "$mpivars_path" == "" ]]; then
38+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
39+
fi
40+
41+
if [[ "$mpivars_path" == "" ]]; then
42+
echo "Could not find MPIPATH"; exit; fi
43+
44+
source $mpivars_path
45+
46+
first_node=`head $hostfile -n 1`
47+
shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
48+
if [ $shape == \"BM.GPU.H100.8\" ]
49+
then
50+
var_UCX_NET_DEVICES=eth0
51+
else
52+
echo "Use the appropriate nccl test run script for non H100 nodes"
53+
fi
54+
55+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
56+
mpirun --mca pml ucx \
57+
--bind-to numa \
58+
-npernode 8 \
59+
--mca coll ^hcoll \
60+
-x HCOLL_ENABLE_MCAST_ALL=0 \
61+
-x coll_hcoll_enable=0 \
62+
-x UCX_TLS=tcp \
63+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
64+
-x RX_QUEUE_LEN=8192 \
65+
-x IB_RX_QUEUE_LEN=8192 \
66+
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
67+
68+
tail -n 32 $logfile
69+
70+
71+
done

0 commit comments

Comments
 (0)