Skip to content

Commit 95645e7

Browse files
committed
Updated code so that the Ansible stdout comparison matches the output shape. Added no-NCCL-param scripts.
1 parent 480407f commit 95645e7

File tree

10 files changed

+279
-13
lines changed

10 files changed

+279
-13
lines changed

playbooks/roles/nccl-conf/files/a100_b4.8

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ NCCL_IB_SL=0
44
NCCL_IB_TC=41
55
NCCL_IB_QPS_PER_CONNECTION=4
66
NCCL_IB_GID_INDEX=3
7-
NCCL_ALGO=Ring
8-
NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
7+
NCCL_IB_HCA==mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12

playbooks/roles/nccl-conf/files/bm.gpu4.8

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ NCCL_IB_SL=0
44
NCCL_IB_TC=41
55
NCCL_IB_QPS_PER_CONNECTION=4
66
NCCL_IB_GID_INDEX=3
7-
NCCL_ALGO=Ring
8-
NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
7+
NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17

playbooks/roles/nccl-conf/files/h100

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ NCCL_IB_SL=0
1010
NCCL_IB_TIMEOUT=22
1111
NCCL_NET_PLUGIN=none
1212
NCCL_SOCKET_IFNAME=eth0
13-
NCCL_ALGO=auto
1413
NCCL_IGNORE_CPU_AFFINITY=1
15-
NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17"
14+
NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
1615
NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
owner: root
1414
group: root
1515
mode: '0644'
16-
when: shape_nccl.stdout == "BM.GPU.H100.8"
16+
when: shape_nccl.stdout == '"BM.GPU.H100.8"'
1717

1818
- name: copy nccl.conf for BM.GPU.B4.8 and A100-v2.8
1919
become: true
@@ -23,7 +23,7 @@
2323
owner: root
2424
group: root
2525
mode: '0644'
26-
when: shape_nccl.stdout == "BM.GPU.B4.8" or shape_nccl.stdout == "BM.GPU.A100-v2.8"
26+
when: shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"'
2727

2828
- name: copy nccl.conf for BM.GPU4.8
2929
become: true
@@ -33,4 +33,4 @@
3333
owner: root
3434
group: root
3535
mode: '0644'
36-
when: shape_nccl.stdout == "BM.GPU4.8"
36+
when: shape_nccl.stdout == '"BM.GPU4.8"'

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ fi
6161
-x RX_QUEUE_LEN=8192 \
6262
-x IB_RX_QUEUE_LEN=8192 \
6363
-x NCCL_SOCKET_IFNAME=eth0 \
64-
-x NCCL_ALGO=auto \
6564
-x NCCL_IGNORE_CPU_AFFINITY=1 \
6665
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
6766
-x NCCL_TOPO_FILE=~/H100-topology.xml \
68-
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
67+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,10 @@ do
7575
-x RX_QUEUE_LEN=8192 \
7676
-x IB_RX_QUEUE_LEN=8192 \
7777
-x NCCL_SOCKET_IFNAME=eth0 \
78-
-x NCCL_ALGO=auto \
7978
-x NCCL_IGNORE_CPU_AFFINITY=1 \
8079
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
8180
-x NCCL_TOPO_FILE=~/H100-topology.xml \
82-
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile
81+
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
8382

8483
tail -n 32 $logfile
8584

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=nccl-allreduce-slurm
3+
#SBATCH --nodes=2
4+
#SBATCH --gpus-per-node=8
5+
#SBATCH --ntasks-per-node=8
6+
#SBATCH --exclusive
7+
export PMI_DEBUG=1
8+
9+
10+
cd /nfs/cluster
11+
mkdir $SLURM_JOB_ID
12+
cd $SLURM_JOB_ID
13+
14+
MACHINEFILE="hostfile"
15+
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
16+
ORDEREDRANKMACHINEFILE="rankfile_system_name"
17+
18+
scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
19+
echo MACHINEFILE
20+
cat $MACHINEFILE
21+
22+
source /etc/os-release
23+
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
24+
python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
25+
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
26+
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
27+
fi
28+
29+
30+
echo ORDEREDMACHINEFILE
31+
cat $ORDEREDMACHINEFILE
32+
echo ORDEREDRANKMACHINEFILE
33+
cat $ORDEREDRANKMACHINEFILE
34+
35+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
36+
37+
if [[ "$mpivars_path" == "" ]]; then
38+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
39+
fi
40+
41+
if [[ "$mpivars_path" == "" ]]; then
42+
echo "Could not find MPIPATH"; exit; fi
43+
44+
source $mpivars_path
45+
46+
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
47+
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
48+
then
49+
var_UCX_NET_DEVICES=mlx5_0:1
50+
elif [ $shape == \"BM.GPU4.8\" ]
51+
then
52+
var_UCX_NET_DEVICES=mlx5_4:1
53+
fi
54+
55+
mpirun --mca pml ucx \
56+
--bind-to numa \
57+
--mca coll ^hcoll \
58+
-x UCX_TLS=ud,self,sm \
59+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
60+
-x HCOLL_ENABLE_MCAST_ALL=0 \
61+
-x coll_hcoll_enable=0 \
62+
-x NCCL_ALGO=Ring \
63+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from the -n iterations parameter of nccl allreduce, which is set below using $iter
5+
max=$1
6+
7+
# This assumes the hostfile passed is already ordered based on their rackId
8+
if [ -n "$2" ]; then
9+
hostfile=$2
10+
else
11+
hostfile="/tmp/ordered_hostfile_system_name"
12+
fi
13+
14+
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
15+
ORDEREDRANKMACHINEFILE="rankfile_system_name"
16+
echo INPUTFILE
17+
cat $hostfile
18+
19+
# will generate rack-aware ordered host file
20+
source /etc/os-release
21+
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
22+
python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
23+
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
24+
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
25+
fi
26+
27+
hostfile=$ORDEREDMACHINEFILE
28+
rankfile=$ORDEREDRANKMACHINEFILE
29+
30+
echo ORDEREDMACHINEFILE
31+
cat $ORDEREDMACHINEFILE
32+
echo ORDEREDRANKMACHINEFILE
33+
cat $ORDEREDRANKMACHINEFILE
34+
35+
# The number of GPUs to use for the test. Has to be a multiple of 8. If not passed, all GPUs will be used.
36+
if [ -n "$3" ]; then
37+
np=$3
38+
else
39+
np=$((`less $hostfile | wc -l` * 8 ))
40+
fi
41+
42+
logfile="nccl_run_allreduce.sh.log"
43+
44+
for x in $(seq 1 1 $max)
45+
do
46+
47+
echo $x
48+
echo $x >> $logfile
49+
date >> $logfile
50+
51+
rankfile=$rankfile; np=$np ; iter=20;
52+
53+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
54+
source $mpivars_path
55+
56+
if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi
57+
58+
first_node=`head $hostfile -n 1`
59+
shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
60+
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
61+
then
62+
var_UCX_NET_DEVICES=mlx5_0:1
63+
elif [ $shape == \"BM.GPU4.8\" ]
64+
then
65+
var_UCX_NET_DEVICES=mlx5_4:1
66+
fi
67+
68+
# final version
69+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
70+
mpirun --mca pml ucx \
71+
--bind-to numa \
72+
--mca coll ^hcoll \
73+
-x UCX_TLS=ud,self,sm \
74+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
75+
-x HCOLL_ENABLE_MCAST_ALL=0 \
76+
-x coll_hcoll_enable=0 \
77+
-x NCCL_ALGO=Ring \
78+
--np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile
79+
80+
tail -n 32 $logfile
81+
82+
83+
done
84+
85+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=nccl-allreduce-slurm
3+
#SBATCH --nodes=2
4+
#SBATCH --gpus-per-node=8
5+
#SBATCH --ntasks-per-node=8
6+
#SBATCH --exclusive
7+
export PMI_DEBUG=1
8+
9+
10+
cd /nfs/cluster
11+
mkdir $SLURM_JOB_ID
12+
cd $SLURM_JOB_ID
13+
14+
MACHINEFILE="hostfile"
15+
16+
scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
17+
echo MACHINEFILE
18+
cat $MACHINEFILE
19+
20+
source /etc/os-release
21+
22+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
23+
24+
if [[ "$mpivars_path" == "" ]]; then
25+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
26+
fi
27+
28+
if [[ "$mpivars_path" == "" ]]; then
29+
echo "Could not find MPIPATH"; exit; fi
30+
31+
source $mpivars_path
32+
33+
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
34+
if [ $shape == \"BM.GPU.H100.8\" ]
35+
then
36+
var_UCX_NET_DEVICES=eth0
37+
else
38+
echo "Use the appropriate nccl test run script for non H100 nodes"
39+
fi
40+
41+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
42+
mpirun --mca pml ucx \
43+
--bind-to numa \
44+
-npernode 8 \
45+
--mca coll ^hcoll \
46+
-x HCOLL_ENABLE_MCAST_ALL=0 \
47+
-x coll_hcoll_enable=0 \
48+
-x UCX_TLS=tcp \
49+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
50+
-x RX_QUEUE_LEN=8192 \
51+
-x IB_RX_QUEUE_LEN=8192 \
52+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# number of times to run the nccl test to stress the GPUs and RDMA network.
5+
max=$1
6+
7+
# This assumes the hostfile passed is already ordered based on their rackId
8+
if [ -n "$2" ]; then
9+
hostfile=$2
10+
else
11+
hostfile="/etc/opt/oci-hpc/hostfile.tcp"
12+
fi
13+
14+
echo INPUTFILE
15+
cat $hostfile
16+
17+
# The number of GPUs to use for the test. Has to be a multiple of 8. If not passed, all GPUs will be used.
18+
if [ -n "$3" ]; then
19+
np=$3
20+
else
21+
np=$((`less $hostfile | wc -l` * 8 ))
22+
fi
23+
24+
logfile="nccl_run_allreduce.sh.log"
25+
26+
for x in $(seq 1 1 $max)
27+
do
28+
29+
echo $x
30+
echo $x >> $logfile
31+
date >> $logfile
32+
33+
hostfile=$hostfile; np=$np;
34+
35+
mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
36+
37+
if [[ "$mpivars_path" == "" ]]; then
38+
mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
39+
fi
40+
41+
if [[ "$mpivars_path" == "" ]]; then
42+
echo "Could not find MPIPATH"; exit; fi
43+
44+
source $mpivars_path
45+
46+
first_node=`head $hostfile -n 1`
47+
shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
48+
if [ $shape == \"BM.GPU.H100.8\" ]
49+
then
50+
var_UCX_NET_DEVICES=eth0
51+
else
52+
echo "Use the appropriate nccl test run script for non H100 nodes"
53+
fi
54+
55+
# all NCCL parameters are at /etc/nccl.conf on each compute node.
56+
mpirun --mca pml ucx \
57+
--bind-to numa \
58+
-npernode 8 \
59+
--mca coll ^hcoll \
60+
-x HCOLL_ENABLE_MCAST_ALL=0 \
61+
-x coll_hcoll_enable=0 \
62+
-x UCX_TLS=tcp \
63+
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
64+
-x RX_QUEUE_LEN=8192 \
65+
-x IB_RX_QUEUE_LEN=8192 \
66+
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
67+
68+
tail -n 32 $logfile
69+
70+
71+
done

0 commit comments

Comments
 (0)