
Commit f4a6673

Merge pull request #104 from oci-hpc/2.10.1_ds_updates
srun examples
2 parents 1da9969 + 0d808a1 commit f4a6673

File tree

5 files changed: +248 −13 lines changed


playbooks/roles/rack-aware/files/node_ordering_by_rack.py

Lines changed: 8 additions & 6 deletions
@@ -4,13 +4,17 @@
 import argparse
 import subprocess

-def write_ordered_hostfile(ordered_hosts=[],hostfile=None):
+def write_ordered_hostfile(ordered_hosts=[],hostfile=None,srun=False):
     #ordered_hostfile="ordered_hostfile"
     if os.path.isfile(hostfile):
         os.remove(hostfile)
     fhandler = open(hostfile,"w")
     for h in ordered_hosts:
-        fhandler.write(h+"\n")
+        if srun:
+            for x in range(8):
+                fhandler.write(h+"\n")
+        else:
+            fhandler.write(h+"\n")
     fhandler.close()

 def write_ordered_rankfile(ordered_hosts=[],hostfile=None):

@@ -47,18 +51,14 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None):
 from pssh.clients import ParallelSSHClient
 client = ParallelSSHClient(hosts)
 output = client.run_command('curl http://169.254.169.254/opc/v1/host/')
-#print(output)
 for host_out in output:
     j = json.loads(bytearray(''.join(list(host_out.stdout)).encode()))
-    #print(j)
     if j['rackId'] in r:
         r[j['rackId']].append( host_out.host )
     else:
         r[j['rackId']] = [ host_out.host ]
 hostname_output = client.run_command('/usr/bin/hostname')
-#print(hostname_output)
 for host_out in hostname_output:
-    #j = bytearray(''.join(list(host_out.stdout)).encode())
     j = bytearray(''.join(list(host_out.stdout)).encode())
     friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii')
     #print(j.decode(encoding='ascii')+" "+host_out.host)

@@ -104,6 +104,8 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None):
 write_ordered_hostfile(ordered_hosts,hostfile)
 hostfile="ordered_hostfile_system_name"
 write_ordered_hostfile(ordered_hosts_friendly_name,hostfile)
+hostfile="ordered_hostfile_system_name_srun"
+write_ordered_hostfile(ordered_hosts_friendly_name,hostfile,True)
 rankfile="rankfile_system_name"
 write_ordered_rankfile(ordered_hosts_friendly_name,rankfile)

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-slurm-containers
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
export PMI_DEBUG=1


cd /nfs/scratch
mkdir $SLURM_JOB_ID
cd $SLURM_JOB_ID

MACHINEFILE="hostfile"
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
ORDEREDRANKMACHINEFILE="rankfile_system_name"
ORDEREDSRUNMACHINEFILE="ordered_hostfile_system_name_srun"

scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
echo MACHINEFILE
cat $MACHINEFILE

source /etc/os-release
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
  python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
  USER=opc
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
  python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
  USER=ubuntu
fi

echo ORDEREDMACHINEFILE
cat $ORDEREDMACHINEFILE
echo ORDEREDSRUNMACHINEFILE
cat $ORDEREDSRUNMACHINEFILE

export SLURM_HOSTFILE=$ORDEREDSRUNMACHINEFILE

MPIVARS_PATH=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`

if [[ "$MPIVARS_PATH" == "" ]]; then
  MPIVARS_PATH=`ls /opt/openmpi-*/bin/mpivars.sh`
fi

if [[ "$MPIVARS_PATH" == "" ]]; then
  echo "Could not find MPIPATH"; exit; fi

source $MPIVARS_PATH
LOCAL_MPI=${MPIVARS_PATH%%/bin*}

#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
# no need to pass: -x SLURM_JOB_NODELIST=$host_list

shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_0:1
  var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
elif [ $shape == \"BM.GPU4.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_4:1
  var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
fi

export RX_QUEUE_LEN=8192 \
       IB_RX_QUEUE_LEN=8192 \
       UCX_TLS=ud,self,sm \
       HCOLL_ENABLE_MCAST_ALL=0 \
       coll_hcoll_enable=0 \
       UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
       NCCL_DEBUG=WARN \
       NCCL_IB_TIMEOUT=16 \
       NCCL_IB_SL=0 \
       NCCL_IB_TC=41 \
       NCCL_IGNORE_CPU_AFFINITY=1 \
       NCCL_IB_GID_INDEX=3 \
       NCCL_ALGO=Ring \
       NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
       OMPI_MCA_coll=^hcoll \
       NCCL_IB_QPS_PER_CONNECTION=4

env | grep "SLURMD_NODENAME="

CONTAINER_IMAGE="/nfs/scratch/nvcr.io+nvidia+pytorch+22.12-py3.sqsh"
CONTAINER_MOUNTS="/home/$USER/nccl-tests:/nccl,$LOCAL_MPI:$LOCAL_MPI"

srun --mpi=pmi2 --gpus-per-node=$SBATCH_GPUS_PER_NODE \
     --ntasks-per-node=$SLURM_NTASKS_PER_NODE \
     --distribution=arbitrary \
     --container-image=$CONTAINER_IMAGE \
     --container-mounts=$CONTAINER_MOUNTS \
     bash -c "
     source $MPIVARS_PATH &&
     /nccl/build/all_reduce_perf -b 1G -e 10G -i$((1024*1024*1024*9)) -n 100
     "
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-srun
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
export PMI_DEBUG=1


cd /nfs/scratch
mkdir $SLURM_JOB_ID
cd $SLURM_JOB_ID

MACHINEFILE="hostfile"

scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
echo INPUTFILE
cat $MACHINEFILE

mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`

if [[ "$mpivars_path" == "" ]]; then
  mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
fi

if [[ "$mpivars_path" == "" ]]; then
  echo "Could not find MPIPATH"; exit; fi

source $mpivars_path
echo $mpivars_path

USER=`whoami`

shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_0:1
  var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
elif [ $shape == \"BM.GPU4.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_4:1
  var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
fi

export NCCL_DEBUG=WARN \
       OMPI_MCA_coll=^hcoll \
       RX_QUEUE_LEN=8192 \
       IB_RX_QUEUE_LEN=8192 \
       NCCL_IGNORE_CPU_AFFINITY=1 \
       NCCL_IB_SL=0 \
       NCCL_IB_TC=41 \
       NCCL_IB_QPS_PER_CONNECTION=4 \
       UCX_TLS=ud,self,sm \
       UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
       HCOLL_ENABLE_MCAST_ALL=0 \
       coll_hcoll_enable=0 \
       NCCL_IB_GID_INDEX=3 \
       NCCL_ALGO=Ring \
       NCCL_IB_HCA="${var_NCCL_IB_HCA}"
srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
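
Note (not part of the commit): with -b1G -e10G and an increment of $((1024*1024*1024*9)) bytes, the all_reduce_perf size sweep contains only two message sizes, 1 GiB and 10 GiB, each timed for -n 100 iterations. A quick illustration of that arithmetic:

# Illustrative only: nccl-tests steps linearly from -b to -e by -i bytes.
GIB = 1024 ** 3
begin, end, step = 1 * GIB, 10 * GIB, 9 * GIB
sizes = list(range(begin, end + 1, step))
print([size // GIB for size in sizes])  # [1, 10]
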
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
#!/bin/bash
set -e

# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter
max=$1

# This assumes that the hostfile passed is already ordered based on their rackId or slurm 23.02 and higher will order it based on topology
if [ -n "$2" ]; then
  hostfile=$2
else
  hostfile="/tmp/ordered_hostfile_system_name"
fi

echo INPUTFILE
cat $hostfile

if [ -n "$3" ]; then
  logfile=$3
else
  logfile="nccl_run_allreduce_srun.sh.log"
fi

echo $logfile

for x in $(seq 1 1 $max)
do

  echo $x
  echo $x >> $logfile
  date >> $logfile

  hostfile=$hostfile

  mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`

  if [[ "$mpivars_path" == "" ]]; then
    mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
  fi

  if [[ "$mpivars_path" == "" ]]; then
    echo "Could not find MPIPATH"; exit; fi

  source $mpivars_path
  echo $mpivars_path

  USER=`whoami`

  first_node=`head $hostfile -n 1`
  shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
  if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
  then
    var_UCX_NET_DEVICES=mlx5_0:1
    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
  elif [ $shape == \"BM.GPU4.8\" ]
  then
    var_UCX_NET_DEVICES=mlx5_4:1
    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
  fi

  export NCCL_DEBUG=WARN \
         OMPI_MCA_coll=^hcoll \
         RX_QUEUE_LEN=8192 \
         IB_RX_QUEUE_LEN=8192 \
         NCCL_IGNORE_CPU_AFFINITY=1 \
         NCCL_IB_SL=0 \
         NCCL_IB_TC=41 \
         NCCL_IB_QPS_PER_CONNECTION=4 \
         UCX_TLS=ud,self,sm \
         UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
         HCOLL_ENABLE_MCAST_ALL=0 \
         coll_hcoll_enable=0 \
         NCCL_IB_GID_INDEX=3 \
         NCCL_ALGO=Ring \
         NCCL_IB_HCA="${var_NCCL_IB_HCA}"
  srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile



  tail -n 32 $logfile


done
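
Note (not part of the commit): because each repetition appends a full all_reduce_perf report to $logfile, the repeated runs can be summarized afterwards. A hypothetical post-processing sketch, assuming the standard nccl-tests "Avg bus bandwidth" summary line and the default logfile name used above:

# Hypothetical helper: report the bus bandwidth seen across all repetitions.
import re
import sys

logfile = sys.argv[1] if len(sys.argv) > 1 else "nccl_run_allreduce_srun.sh.log"
with open(logfile) as fh:
    bandwidths = [float(m.group(1))
                  for m in re.finditer(r"Avg bus bandwidth\s*:\s*([\d.]+)", fh.read())]

if bandwidths:
    print(f"{len(bandwidths)} runs, min {min(bandwidths):.2f}, max {max(bandwidths):.2f} GB/s")
else:
    print("no nccl-tests summary lines found")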

samples/nccl_compile/compile.sh

Lines changed: 3 additions & 7 deletions
@@ -14,14 +14,10 @@ if [[ "$mpivars_path" == "" ]]; then
 source $mpivars_path
 MPI_HOME=${mpivars_path%%/bin*}

-source /etc/os-release
-if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
-  cd /home/opc
-elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
-  cd /home/ubuntu
-fi
-
+USER=`whoami`

+cd /home/$USER
+rm -rf nccl-tests
 git clone https://github.com/NVIDIA/nccl-tests.git
 cd nccl-tests/
 make MPI=1 MPI_HOME=$MPI_HOME CUDA_HOME=/usr/local/cuda
