
Commit caeb169

added container NCCL script for H100
1 parent 49e270c commit caeb169

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-slurm-containers
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
export PMI_DEBUG=1

# Work out of a per-job directory on the shared NFS mount
cd /nfs/cluster
mkdir $SLURM_JOB_ID
cd $SLURM_JOB_ID

MACHINEFILE="hostfile"

# Write the allocated node list to the hostfile and print it
scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
echo $MACHINEFILE
cat $MACHINEFILE

source /etc/os-release

# Locate the Open MPI environment script installed on the host
MPIVARS_PATH=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`

if [[ "$MPIVARS_PATH" == "" ]]; then
    MPIVARS_PATH=`ls /opt/openmpi-*/bin/mpivars.sh`
fi

if [[ "$MPIVARS_PATH" == "" ]]; then
    echo "Could not find MPIVARS_PATH"; exit 1
fi

source $MPIVARS_PATH
LOCAL_MPI=${MPIVARS_PATH%/*}

# Query the OCI instance metadata service for the node shape
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
if [[ "$shape" == \"BM.GPU.H100.8\" ]]
then
  var_UCX_NET_DEVICES=eth0
else
  echo "Use the appropriate NCCL test run script for non-H100 nodes"
fi

# NCCL / UCX settings for the BM.GPU.H100.8 RDMA network (mlx5_* NICs)
export NCCL_CROSS_NIC=0 \
       NCCL_SOCKET_NTHREADS=16 \
       NCCL_DEBUG=WARN \
       NCCL_CUMEM_ENABLE=0 \
       NCCL_IB_SPLIT_DATA_ON_QPS=0 \
       NCCL_IB_QPS_PER_CONNECTION=16 \
       NCCL_IB_GID_INDEX=3 \
       NCCL_IB_TC=41 \
       NCCL_IB_SL=0 \
       NCCL_IB_TIMEOUT=22 \
       NCCL_NET_PLUGIN=none \
       NCCL_SOCKET_IFNAME=eth0 \
       NCCL_IGNORE_CPU_AFFINITY=1 \
       NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17" \
       NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml \
       HCOLL_ENABLE_MCAST_ALL=0 \
       coll_hcoll_enable=0 \
       UCX_TLS=tcp \
       UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
       RX_QUEUE_LEN=8192 \
       IB_RX_QUEUE_LEN=8192 \
       OMPI_MCA_coll=^hcoll

env | grep "SLURMD_NODENAME="
USER=`whoami`

# Container image (enroot .sqsh) and bind mounts for the NCCL tests, host MPI, and NFS
CONTAINER_IMAGE="/home/ubuntu/nvcr.io+nvidia+pytorch+24.01-py3.sqsh"
CONTAINER_MOUNTS="/opt/oci-hpc/nccl-test:/nccl,$LOCAL_MPI:$LOCAL_MPI,/nfs/cluster:/nfs/cluster"
echo $LOCAL_MPI
echo $MPIVARS_PATH

# Launch all_reduce_perf inside the container on every rank (pyxis/enroot srun).
# SLURM_GPUS_PER_NODE is set by Slurm from the --gpus-per-node request above.
srun --mpi=pmi2 --gpus-per-node=$SLURM_GPUS_PER_NODE \
     --ntasks-per-node=$SLURM_NTASKS_PER_NODE \
     --container-image=$CONTAINER_IMAGE \
     --container-mounts=$CONTAINER_MOUNTS \
     bash -c "
     source $MPIVARS_PATH &&
     /nccl/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
     "
