#!/bin/bash
#SBATCH --job-name=nccl-allreduce-slurm-containers
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
7+ export PMI_DEBUG=1
8+
9+
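# Work out of a per-job directory on the shared /nfs/scratch filesystem so output files from different jobs do not collide.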
10+ cd /nfs/scratch
11+ mkdir $SLURM_JOB_ID
12+ cd $SLURM_JOB_ID
13+
MACHINEFILE="hostfile"
ORDEREDMACHINEFILE="ordered_hostfile_system_name"
ORDEREDRANKMACHINEFILE="rankfile_system_name"
ORDEREDSRUNMACHINEFILE="ordered_hostfile_system_name_srun"
18+
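# Expand the Slurm nodelist into one hostname per line.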
19+ scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
20+ echo MACHINEFILE
21+ cat $MACHINEFILE
22+
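# Detect the OS to pick the right home directory and default user, then run the rack-aware node ordering
# script; it is expected to write the ordered hostfile/rankfile variants named above into the current directory.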
23+ source /etc/os-release
if [ "$ID" == "ol" ] || [ "$ID" == "centos" ] ; then
  python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
  USER=opc
elif [ "$ID" == "debian" ] || [ "$ID" == "ubuntu" ] ; then
  python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
  USER=ubuntu
fi
31+
32+ echo ORDEREDMACHINEFILE
33+ cat $ORDEREDMACHINEFILE
34+ echo ORDEREDSRUNMACHINEFILE
35+ cat $ORDEREDSRUNMACHINEFILE
36+
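# srun with --distribution=arbitrary places tasks in the order listed in SLURM_HOSTFILE, so ranks follow the rack ordering produced above.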
37+ export SLURM_HOSTFILE=$ORDEREDSRUNMACHINEFILE
38+
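# Locate the Open MPI mpivars.sh script (Mellanox OFED install path first, then /opt).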
MPIVARS_PATH=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
40+
41+ if [[ " $MPIVARS_PATH " == " " ]]; then
42+ MPIVARS_PATH=` ls /opt/openmpi-* /bin/mpivars.sh`
43+ fi
44+
45+ if [[ " $MPIVARS_PATH " == " " ]]; then
46+ echo " Could not find MPIPATH" ; exit ; fi
47+
48+ source $MPIVARS_PATH
LOCAL_MPI=${MPIVARS_PATH%%/bin*}
50+
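# For reference, the equivalent bare-metal mpirun invocation (not used in the containerized srun path below):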
51+ # mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
52+ # no need to pass: -x SLURM_JOB_NODELIST=$host_list
53+
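# Query the OCI instance metadata service for the bare-metal shape and select the matching RDMA NIC set for NCCL/UCX.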
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_0:1
  var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
elif [ $shape == \"BM.GPU4.8\" ]
then
  var_UCX_NET_DEVICES=mlx5_4:1
  var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
fi
64+
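# NCCL / UCX tuning exported into the job environment; srun propagates it to the container tasks by default.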
65+ export RX_QUEUE_LEN=8192 \
66+ IB_RX_QUEUE_LEN=8192 \
67+ UCX_TLS=ud,self,sm \
68+ HCOLL_ENABLE_MCAST_ALL=0 \
69+ coll_hcoll_enable=0 \
70+ UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
71+ NCCL_DEBUG=WARN \
72+ NCCL_IB_TIMEOUT=16 \
73+ NCCL_IB_SL=0 \
74+ NCCL_IB_TC=41 \
75+ NCCL_IGNORE_CPU_AFFINITY=1 \
76+ NCCL_IB_GID_INDEX=3 \
77+ NCCL_ALGO=Ring \
78+ NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
79+ OMPI_MCA_coll=^hcoll \
80+ NCCL_IB_QPS_PER_CONNECTION=4
81+
env | grep "SLURMD_NODENAME="
83+
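# Enroot squashfs image and bind mounts: the nccl-tests build from the host and the matching Open MPI install.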
CONTAINER_IMAGE="/nfs/scratch/nvcr.io+nvidia+pytorch+22.12-py3.sqsh"
CONTAINER_MOUNTS="/home/$USER/nccl-tests:/nccl,$LOCAL_MPI:$LOCAL_MPI"
86+
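# Launch one task per GPU with PMI2 wire-up; --distribution=arbitrary honors the rack-ordered SLURM_HOSTFILE set above.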
srun --mpi=pmi2 --gpus-per-node=$SLURM_GPUS_PER_NODE \
88+ --ntasks-per-node=$SLURM_NTASKS_PER_NODE \
89+ --distribution=arbitrary \
90+ --container-image=$CONTAINER_IMAGE \
91+ --container-mounts=$CONTAINER_MOUNTS \
92+ bash -c "
93+ source $MPIVARS_PATH &&
94+ /nccl/build/all_reduce_perf -b 1G -e 10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100
95+ "