Skip to content

Commit baa8790

Browse files
Merge branch '2.10.4' of https://github.com/oci-hpc/oci-hpc-clusternetwork-dev into 2.10.4
2 parents 96c33ff + 5d30de6 commit baa8790

File tree

2 files changed

+106
-10
lines changed

2 files changed

+106
-10
lines changed

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
export PMI_DEBUG=1
88

99

10-
cd /nfs/scratch
10+
cd /nfs/cluster
1111
mkdir $SLURM_JOB_ID
1212
cd $SLURM_JOB_ID
1313

@@ -30,8 +30,6 @@ if [[ "$mpivars_path" == "" ]]; then
3030

3131
source $mpivars_path
3232

33-
export NCCL_DEBUG=WARN
34-
3533
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
3634
if [ $shape == \"BM.GPU.H100.8\" ]
3735
then
@@ -43,17 +41,28 @@ fi
4341

4442
mpirun --mca pml ucx \
4543
--bind-to numa \
44+
-npernode 8 \
4645
--mca coll ^hcoll \
46+
-x NCCL_CROSS_NIC=0 \
47+
-x NCCL_SOCKET_NTHREADS=16 \
4748
-x NCCL_DEBUG=WARN \
48-
-x NCCL_IB_SL=0 \
49-
-x NCCL_IB_TC=41 \
49+
-x NCCL_CUMEM_ENABLE=0 \
50+
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
5051
-x NCCL_IB_QPS_PER_CONNECTION=16 \
51-
-x UCX_TLS=tcp \
52-
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
52+
-x NCCL_IB_GID_INDEX=3 \
53+
-x NCCL_IB_TC=41 \
54+
-x NCCL_IB_SL=0 \
55+
-x NCCL_IB_TIMEOUT=22 \
56+
-x NCCL_NET_PLUGIN=none \
5357
-x HCOLL_ENABLE_MCAST_ALL=0 \
5458
-x coll_hcoll_enable=0 \
55-
-x NCCL_IB_GID_INDEX=3 \
56-
-x NCCL_ALGO=Auto \
59+
-x UCX_TLS=tcp \
60+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
61+
-x RX_QUEUE_LEN=8192 \
62+
-x IB_RX_QUEUE_LEN=8192 \
63+
-x NCCL_SOCKET_IFNAME=eth0 \
64+
-x NCCL_ALGO=auto \
65+
-x NCCL_IGNORE_CPU_AFFINITY=1 \
5766
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
5867
-x NCCL_TOPO_FILE=~/H100-topology.xml \
59-
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
68+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/bash
#
# Stress-test the GPUs and the RDMA network by launching the NCCL
# all_reduce_perf benchmark repeatedly through mpirun on BM.GPU.H100.8 nodes.
#
# Usage: <this script> [runs] [hostfile] [np]
#   runs      Number of times to launch the benchmark (default: 1, which is
#             what `seq 1 1` produced when the argument was omitted). This is
#             different from the benchmark's own -n iteration count, which is
#             set by ITERATIONS below.
#   hostfile  One host per line; assumed to be already ordered by rackId
#             (default: /etc/opt/oci-hpc/hostfile.tcp).
#   np        Number of GPUs (MPI ranks) to use; has to be a multiple of 8.
#             If omitted, 8 GPUs per host listed in the hostfile are used.
#
# Benchmark output is appended to nccl_run_allreduce.sh.log in the current
# directory; the last 32 lines of that log are printed after every run.
#
# NOTE: `set -u` is deliberately not enabled because the sourced mpivars.sh is
# outside this script's control and may read unset variables.
set -e

readonly DEFAULT_HOSTFILE="/etc/opt/oci-hpc/hostfile.tcp"
readonly LOGFILE="nccl_run_allreduce.sh.log"
readonly GPUS_PER_NODE=8
# Value passed to all_reduce_perf -n. Earlier revisions assigned iter=20 but
# never used it; the command line always passed 100, so 100 is kept here.
readonly ITERATIONS=100
readonly BENCHMARK="/opt/oci-hpc/nccl-test/build/all_reduce_perf"
readonly H100_SHAPE="BM.GPU.H100.8"
# Executed on the first host to read the instance metadata (JSON).
readonly METADATA_CMD='curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/'

# Print a message to stderr and terminate with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the first mpivars.sh found in the known Open MPI install locations.
# Returns non-zero when none exists. Globbing is used instead of `ls` so a
# missing first location does not trip `set -e` before the fallback is tried.
find_mpivars() {
  local candidate
  for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh \
                   /opt/openmpi-*/bin/mpivars.sh; do
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

# Print the number of non-blank lines (hosts) in the hostfile given as $1.
count_hosts() {
  awk 'NF { n++ } END { print n + 0 }' "$1"
}

# Launch one all_reduce_perf run.
# Arguments: $1 - total rank count, $2 - hostfile,
#            $3 - UCX_NET_DEVICES value, $4 - NCCL_IB_HCA value
# Outputs:   benchmark output on stdout (the caller redirects it to the log).
run_allreduce() {
  local np=$1 hostfile=$2 ucx_net_devices=$3 nccl_ib_hca=$4
  local -a mpirun_args=(
    --mca pml ucx
    --bind-to numa
    -npernode "$GPUS_PER_NODE"
    --mca coll '^hcoll'
    -x NCCL_CROSS_NIC=0
    -x NCCL_SOCKET_NTHREADS=16
    -x NCCL_DEBUG=WARN
    -x NCCL_CUMEM_ENABLE=0
    -x NCCL_IB_SPLIT_DATA_ON_QPS=0
    -x NCCL_IB_QPS_PER_CONNECTION=16
    -x NCCL_IB_GID_INDEX=3
    -x NCCL_IB_TC=41
    -x NCCL_IB_SL=0
    -x NCCL_IB_TIMEOUT=22
    -x NCCL_NET_PLUGIN=none
    -x HCOLL_ENABLE_MCAST_ALL=0
    -x coll_hcoll_enable=0
    -x UCX_TLS=tcp
    -x "UCX_NET_DEVICES=${ucx_net_devices}"
    -x RX_QUEUE_LEN=8192
    -x IB_RX_QUEUE_LEN=8192
    -x NCCL_SOCKET_IFNAME=eth0
    -x NCCL_ALGO=auto
    -x NCCL_IGNORE_CPU_AFFINITY=1
    -x "NCCL_IB_HCA=${nccl_ib_hca}"
    -x "NCCL_TOPO_FILE=${HOME}/H100-topology.xml"
    --np "$np"
    --hostfile "$hostfile"
  )
  mpirun "${mpirun_args[@]}" \
    "$BENCHMARK" -b1G -e10G -i$((1024*1024*1024*9)) -n "$ITERATIONS"
}

main() {
  local max=${1:-1}
  local hostfile=${2:-$DEFAULT_HOSTFILE}
  local np=${3:-}

  [[ "$max" =~ ^[0-9]+$ ]] \
    || die "run count must be a non-negative integer, got '$max'"
  max=$((10#$max))   # force base 10 so a value such as 08 is not read as octal
  [[ -r "$hostfile" ]] || die "cannot read hostfile '$hostfile'"

  echo INPUTFILE
  cat -- "$hostfile"

  if [[ -z "$np" ]]; then
    local host_count
    host_count=$(count_hosts "$hostfile")
    np=$(( host_count * GPUS_PER_NODE ))
  fi
  [[ "$np" =~ ^[1-9][0-9]*$ ]] \
    || die "number of GPUs must be a positive integer, got '$np'"

  # Everything below up to the loop is invariant across runs, so it is done
  # once instead of on every iteration.
  local mpivars_path
  mpivars_path=$(find_mpivars) || die "Could not find MPIPATH"
  # shellcheck disable=SC1090
  source "$mpivars_path"

  local first_node
  first_node=$(awk 'NF { print $1; exit }' "$hostfile")
  [[ -n "$first_node" ]] || die "hostfile '$hostfile' lists no hosts"

  # Query and parse in two steps so an ssh/curl failure is not hidden behind
  # jq's exit status.
  local metadata shape
  metadata=$(ssh "$first_node" "$METADATA_CMD") \
    || die "could not query instance metadata on $first_node"
  shape=$(jq -r .shape <<<"$metadata") \
    || die "could not parse instance metadata from $first_node"

  # The settings below only apply to H100 nodes; running with them unset on
  # another shape would launch mpirun with empty UCX/NCCL device lists.
  [[ "$shape" == "$H100_SHAPE" ]] \
    || die "Use the appropriate nccl test run script for non H100 nodes"

  local ucx_net_devices=eth0
  # The leading '=' is part of the value handed to NCCL.
  local nccl_ib_hca="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17"

  local run rc
  for (( run = 1; run <= max; run++ )); do
    echo "$run"
    echo "$run" >> "$LOGFILE"
    date >> "$LOGFILE"

    rc=0
    run_allreduce "$np" "$hostfile" "$ucx_net_devices" "$nccl_ib_hca" \
      >> "$LOGFILE" || rc=$?

    tail -n 32 -- "$LOGFILE"

    if (( rc != 0 )); then
      printf 'run %s failed with status %s; see %s\n' \
        "$run" "$rc" "$LOGFILE" >&2
      exit "$rc"
    fi
  done
}

main "$@"

0 commit comments

Comments
 (0)