|
| 1 | +#!/bin/bash |
| 2 | +#SBATCH --job-name=nccl-allreduce-slurm |
| 3 | +#SBATCH --nodes=2 |
| 4 | +#SBATCH --gpus-per-node=8 |
| 5 | +#SBATCH --ntasks-per-node=8 |
| 6 | +#SBATCH --exclusive |
| 7 | +export PMI_DEBUG=1 |
| 8 | +# Slurm header above: 2 nodes, 8 GPUs and 8 tasks per node, --exclusive allocation; PMI_DEBUG=1 is exported for the rest of the job. |
| 9 | +# Work in a per-job directory under /nfs/scratch; stop if it cannot be created or entered so no job files land elsewhere. |
| 10 | +cd /nfs/scratch || exit 1 |
| 11 | +mkdir -p -- "${SLURM_JOB_ID:?SLURM_JOB_ID is not set}" || exit 1 |
| 12 | +cd -- "$SLURM_JOB_ID" || exit 1 |
| 13 | +# File names used below: the plain hostfile is written here; the two "ordered" files are printed and used later (presumably produced by the ordering script - confirm). |
| 14 | +MACHINEFILE="hostfile" |
| 15 | +ORDEREDMACHINEFILE="ordered_hostfile_system_name" |
| 16 | +ORDEREDRANKMACHINEFILE="rankfile_system_name" |
| 17 | +# Quote the node list: it can contain [..] ranges, which the shell would otherwise treat as a glob pattern. |
| 18 | +scontrol show hostnames "$SLURM_JOB_NODELIST" > "$MACHINEFILE" |
| 19 | +echo MACHINEFILE |
| 20 | +cat "$MACHINEFILE" |
| 21 | +# Pick the per-distro location of node_ordering_by_rack.py and run it on the hostfile (stdout discarded); fail early on an unknown distro instead of continuing without a rankfile. |
| 22 | +source /etc/os-release |
| 23 | +case "${ID:-}" in |
| 24 | +  ol|centos) ordering_script=/home/opc/node_ordering_by_rack.py ;; |
| 25 | +  debian|ubuntu) ordering_script=/home/ubuntu/node_ordering_by_rack.py ;; |
| 26 | +  *) echo "Unsupported OS ID '${ID:-}': cannot locate node_ordering_by_rack.py" >&2; exit 1 ;; |
| 27 | +esac |
| 28 | +python3 "$ordering_script" --input_file "$MACHINEFILE" > /dev/null |
| 29 | +# Print the ordered hostfile and the rankfile, each preceded by a label line, into the job output. |
| 30 | +printf '%s\n' ORDEREDMACHINEFILE |
| 31 | +cat "$ORDEREDMACHINEFILE" |
| 32 | +printf '%s\n' ORDEREDRANKMACHINEFILE |
| 33 | +cat "$ORDEREDRANKMACHINEFILE" |
| 34 | +# Locate Open MPI's mpivars.sh (first match wins, /usr/mpi/gcc before /opt) and source it; a glob loop replaces parsing `ls` output. |
| 35 | +mpivars_path="" |
| 36 | +for candidate in /usr/mpi/gcc/openmpi-*/bin/mpivars.sh /opt/openmpi-*/bin/mpivars.sh; do |
| 37 | +  if [[ -f "$candidate" ]]; then mpivars_path="$candidate"; break; fi |
| 38 | +done |
| 39 | + |
| 40 | +if [[ -z "$mpivars_path" ]]; then |
| 41 | +  # A bare `exit` would return echo's status (0) and report the failed job as successful. |
| 42 | +  echo "Could not find MPIPATH" >&2; exit 1 |
| 43 | +fi |
| 44 | +source "$mpivars_path" |
| 45 | +# NCCL log level for this shell; the live mpirun command further down also passes NCCL_DEBUG=WARN explicitly via -x. |
| 46 | +export NCCL_DEBUG=WARN |
| 47 | +# Earlier single-line mpirun invocation with hard-coded device lists, left commented out (presumably kept for reference). |
| 48 | +#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 |
| 49 | +# no need to pass: -x SLURM_JOB_NODELIST=$host_list |
| 50 | +# Choose the UCX/NCCL InfiniBand devices from the instance shape reported by the metadata endpoint; jq keeps the JSON quotes, hence the quoted patterns. |
| 51 | +shape=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape) |
| 52 | +case "$shape" in |
| 53 | +  '"BM.GPU.B4.8"'|'"BM.GPU.A100-v2.8"') |
| 54 | +    var_UCX_NET_DEVICES=mlx5_0:1 |
| 55 | +    var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" ;; |
| 56 | +  '"BM.GPU4.8"') |
| 57 | +    var_UCX_NET_DEVICES=mlx5_4:1 |
| 58 | +    var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" ;; |
| 59 | +  *) echo "Warning: unrecognized shape ${shape:-<empty>}; UCX_NET_DEVICES and NCCL_IB_HCA are left unset" >&2 ;; |
| 60 | +esac |
| 61 | +# Select the OCI tuner plugin build from the version suffix of the last libnccl.so entry listed by ldconfig (e.g. 2.21.5 -> minor 21). |
| 62 | +NCCL_version=$(sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//') |
| 63 | +IFS=. read -r -a arr_NCCL <<< "$NCCL_version" |
| 64 | +# -gt compares numbers; `>` inside [ ] was a redirection (created a file named 20, true for any non-empty value). NOTE(review): index 1 = minor version; index 2 (patch) was used before, which looks unintended - confirm. |
| 65 | +if [[ "${arr_NCCL[1]:-0}" -gt 20 ]]; then |
| 66 | +  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1 |
| 67 | +else |
| 68 | +  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2 |
| 69 | +fi |
| 70 | +# Launch all_reduce_perf with SLURM_NNODES*SLURM_NTASKS_PER_NODE ranks placed by the rankfile; options are collected in an array, one per line. |
| 71 | +mpirun_opts=( |
| 72 | +  --mca pml ucx |
| 73 | +  --bind-to numa |
| 74 | +  --mca coll ^hcoll |
| 75 | +  -x NCCL_DEBUG=WARN |
| 76 | +  -x NCCL_IB_SL=0 |
| 77 | +  -x NCCL_IB_TC=41 |
| 78 | +  -x NCCL_IB_QPS_PER_CONNECTION=4 |
| 79 | +  -x UCX_TLS=ud,self,sm |
| 80 | +  -x "UCX_NET_DEVICES=${var_UCX_NET_DEVICES}" |
| 81 | +  -x HCOLL_ENABLE_MCAST_ALL=0 |
| 82 | +  -x coll_hcoll_enable=0 |
| 83 | +  -x NCCL_IB_GID_INDEX=3 |
| 84 | +  -x "NCCL_TUNER_PLUGIN=${tuner_path}" |
| 85 | +  -x "NCCL_IB_HCA=${var_NCCL_IB_HCA}" |
| 86 | +  --np "$((SLURM_NNODES*SLURM_NTASKS_PER_NODE))" --rankfile "$ORDEREDRANKMACHINEFILE" |
| 87 | +) |
| 88 | +mpirun "${mpirun_opts[@]}" /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G "-i$((1024*1024*1024*9))" -n 100 |
0 commit comments