Skip to content

Commit 7078f0e

Browse files
committed
added NCCL with srun examples and running NCCL with containers example
1 parent 978c7c1 commit 7078f0e

File tree

3 files changed

+142
-0
lines changed

3 files changed

+142
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-srun
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
#
# Slurm batch script that runs the nccl-tests all_reduce_perf binary through
# srun (PMIx) on the nodes of the current allocation.
#
# Steps:
#   1. Create and enter a per-job working directory under /nfs/scratch.
#   2. Write the allocated hostnames to ./hostfile and print them.
#   3. Locate and source an OpenMPI mpivars.sh.
#   4. Pick the RDMA device lists from the instance shape reported by the
#      instance metadata endpoint.
#   5. Export the NCCL/UCX/OpenMPI settings and launch all_reduce_perf.
#
# Expects /home/$USER/nccl-tests/build/all_reduce_perf to exist.

export PMI_DEBUG=1

# Print the path of the first OpenMPI mpivars.sh found. The /usr/mpi/gcc
# install is preferred over /opt. Returns non-zero when neither exists.
find_mpivars() {
  local candidate
  for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh \
                   /opt/openmpi-*/bin/mpivars.sh; do
    # An unmatched glob stays a literal pattern, which fails the -f test.
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

# Without a job id the cd below would silently land in $HOME.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"

cd /nfs/scratch || { echo "Cannot cd to /nfs/scratch" >&2; exit 1; }
mkdir -p -- "$SLURM_JOB_ID" || exit 1
cd -- "$SLURM_JOB_ID" || exit 1

MACHINEFILE="hostfile"

scontrol show hostnames "$SLURM_JOB_NODELIST" > "$MACHINEFILE"
echo INPUTFILE
cat -- "$MACHINEFILE"

if ! mpivars_path=$(find_mpivars); then
  echo "Could not find MPIPATH" >&2
  exit 1
fi

source "$mpivars_path"
echo "$mpivars_path"

USER=$(whoami)

# jq -r prints the shape without JSON quotes, so it can be matched directly.
shape=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .shape)

var_UCX_NET_DEVICES=""
var_NCCL_IB_HCA=""
case "$shape" in
  BM.GPU.B4.8 | BM.GPU.A100-v2.8)
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
    ;;
  BM.GPU4.8)
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
    ;;
  *)
    # Same as before: continue with empty device lists, but say so.
    echo "Warning: unrecognized shape '${shape}'; UCX_NET_DEVICES and NCCL_IB_HCA are left empty" >&2
    ;;
esac

export NCCL_DEBUG=WARN \
  OMPI_MCA_coll=^hcoll \
  RX_QUEUE_LEN=8192 \
  IB_RX_QUEUE_LEN=8192 \
  NCCL_IGNORE_CPU_AFFINITY=1 \
  NCCL_IB_SL=0 \
  NCCL_IB_TC=41 \
  NCCL_IB_QPS_PER_CONNECTION=4 \
  UCX_TLS=ud,self,sm \
  UCX_NET_DEVICES="${var_UCX_NET_DEVICES}" \
  HCOLL_ENABLE_MCAST_ALL=0 \
  coll_hcoll_enable=0 \
  NCCL_IB_GID_INDEX=3 \
  NCCL_ALGO=Ring \
  NCCL_IB_HCA="${var_NCCL_IB_HCA}"

srun --mpi=pmix_v3 \
  --gpus-per-node="$SLURM_GPUS_PER_NODE" \
  --ntasks-per-node="$SLURM_NTASKS_PER_NODE" \
  "/home/${USER}/nccl-tests/build/all_reduce_perf" -b1G -e10G -i$((1024*1024*1024*9)) -n 100
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/bin/bash
#
# Runs the nccl-tests all_reduce_perf binary through srun a given number of
# times, appending each run's output to a log file.
#
# Usage: <script> <max> [hostfile] [logfile]
#   max      - number of times to run the nccl test to stress the GPUs and
#              RDMA network. This is different from the -n iterations
#              parameter of all_reduce_perf, which is fixed on the srun line.
#   hostfile - node list passed to srun --nodefile
#              (default: /tmp/ordered_hostfile_system_name). It is assumed to
#              be already ordered by rackId; slurm 23.02 and higher will
#              order it based on topology.
#   logfile  - file the output is appended to
#              (default: nccl_run_allreduce_srun.sh.log).
#
# Expects /home/$USER/nccl-tests/build/all_reduce_perf to exist.
set -e

# Print the path of the first OpenMPI mpivars.sh found. The /usr/mpi/gcc
# install is preferred over /opt. Returns non-zero when neither exists.
find_mpivars() {
  local candidate
  for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh \
                   /opt/openmpi-*/bin/mpivars.sh; do
    # An unmatched glob stays a literal pattern, which fails the -f test.
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

max=$1
if ! [[ "$max" =~ ^[0-9]+$ ]]; then
  echo "Usage: ${0##*/} <max> [hostfile] [logfile]" >&2
  exit 2
fi

hostfile=${2:-/tmp/ordered_hostfile_system_name}

echo INPUTFILE
cat -- "$hostfile"

logfile=${3:-nccl_run_allreduce_srun.sh.log}

echo "$logfile"

# Everything up to the loop does not change between runs, so it is done once.

# Tested with "if !" so that set -e cannot abort before the /opt fallback
# inside find_mpivars has been tried.
if ! mpivars_path=$(find_mpivars); then
  echo "Could not find MPIPATH" >&2
  exit 1
fi

source "$mpivars_path"
echo "$mpivars_path"

USER=$(whoami)

# The shape is read from the first host in the hostfile.
# jq -r prints it without JSON quotes, so it can be matched directly.
first_node=$(head -n 1 -- "$hostfile")
shape=$(ssh "$first_node" 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq -r .shape)

var_UCX_NET_DEVICES=""
var_NCCL_IB_HCA=""
case "$shape" in
  BM.GPU.B4.8 | BM.GPU.A100-v2.8)
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
    ;;
  BM.GPU4.8)
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
    ;;
  *)
    # Same as before: continue with empty device lists, but say so.
    echo "Warning: unrecognized shape '${shape}'; UCX_NET_DEVICES and NCCL_IB_HCA are left empty" >&2
    ;;
esac

export NCCL_DEBUG=WARN \
  OMPI_MCA_coll=^hcoll \
  RX_QUEUE_LEN=8192 \
  IB_RX_QUEUE_LEN=8192 \
  NCCL_IGNORE_CPU_AFFINITY=1 \
  NCCL_IB_SL=0 \
  NCCL_IB_TC=41 \
  NCCL_IB_QPS_PER_CONNECTION=4 \
  UCX_TLS=ud,self,sm \
  UCX_NET_DEVICES="${var_UCX_NET_DEVICES}" \
  HCOLL_ENABLE_MCAST_ALL=0 \
  coll_hcoll_enable=0 \
  NCCL_IB_GID_INDEX=3 \
  NCCL_ALGO=Ring \
  NCCL_IB_HCA="${var_NCCL_IB_HCA}"

for ((x = 1; x <= max; x++)); do
  echo "$x"
  # Stamp each run in the log with its index and start time.
  {
    echo "$x"
    date
  } >> "$logfile"

  srun --mpi=pmix_v3 --nodefile="$hostfile" --gpus-per-node=8 --ntasks-per-node=8 \
    "/home/${USER}/nccl-tests/build/all_reduce_perf" -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> "$logfile"

  # Show the last 32 log lines of the run that just finished.
  tail -n 32 -- "$logfile"
done

0 commit comments

Comments
 (0)