Skip to content

Commit 1b73901

Browse files
Merge pull request #162 from oci-hpc/2.10.5_ds_nccl_conf
Add the NCCL parameters to /etc/nccl.conf
2 parents 031dfb4 + 2dfb166 commit 1b73901

14 files changed

+426
-4
lines changed

playbooks/new_nodes.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
when: cluster_network|bool and not use_compute_agent|default(false)|bool
5555
- include_role:
5656
name: nvidia_peermem
57+
- include_role:
58+
name: nccl-conf
59+
when: cluster_network|bool
5760

5861
- hosts: controller,slurm_backup,login,compute
5962
become: true

playbooks/resize_add.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@
5252
when: cluster_network|bool and not use_compute_agent|default(false)|bool
5353
- include_role:
5454
name: nvidia_peermem
55+
- include_role:
56+
name: nccl-conf
57+
when: cluster_network|bool
5558

5659
- hosts: controller,slurm_backup,login,compute
5760
become: true
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
NCCL_DEBUG=WARN
2+
NCCL_IGNORE_CPU_AFFINITY=1
3+
NCCL_IB_SL=0
4+
NCCL_IB_TC=41
5+
NCCL_IB_QPS_PER_CONNECTION=4
6+
NCCL_IB_GID_INDEX=3
7+
NCCL_IB_HCA==mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
NCCL_DEBUG=WARN
2+
NCCL_IGNORE_CPU_AFFINITY=1
3+
NCCL_IB_SL=0
4+
NCCL_IB_TC=41
5+
NCCL_IB_QPS_PER_CONNECTION=4
6+
NCCL_IB_GID_INDEX=3
7+
NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
NCCL_CROSS_NIC=0
2+
NCCL_SOCKET_NTHREADS=16
3+
NCCL_DEBUG=WARN
4+
NCCL_CUMEM_ENABLE=0
5+
NCCL_IB_SPLIT_DATA_ON_QPS=0
6+
NCCL_IB_QPS_PER_CONNECTION=16
7+
NCCL_IB_GID_INDEX=3
8+
NCCL_IB_TC=41
9+
NCCL_IB_SL=0
10+
NCCL_IB_TIMEOUT=22
11+
NCCL_NET_PLUGIN=none
12+
NCCL_SOCKET_IFNAME=eth0
13+
NCCL_IGNORE_CPU_AFFINITY=1
14+
NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
15+
NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
---
# tasks file for nccl-conf
#
# Installs a shape-specific NCCL tuning file as /etc/nccl.conf so jobs on
# this node pick up the right NCCL defaults for its GPU shape.

# Ask the OCI instance metadata service (IMDSv2) for this node's shape.
# NOTE: jq '.shape' keeps the JSON quotes, so stdout looks like
# "BM.GPU.H100.8" (quotes included) — the when: clauses below compare
# against that quoted form on purpose.
- name: Get the shape
  shell:
    cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq '.shape'"
  register: shape_nccl
  changed_when: false  # read-only probe; must never report "changed"

- name: copy nccl.conf for H100
  become: true
  copy:
    src: h100
    dest: /etc/nccl.conf
    owner: root
    group: root
    mode: '0644'
  when: shape_nccl.stdout == '"BM.GPU.H100.8"'

- name: copy nccl.conf for BM.GPU.B4.8 and A100-v2.8
  become: true
  copy:
    src: a100_b4.8
    dest: /etc/nccl.conf
    owner: root
    group: root
    mode: '0644'
  when: shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"'

- name: copy nccl.conf for BM.GPU4.8
  become: true
  copy:
    src: bm.gpu4.8
    dest: /etc/nccl.conf
    owner: root
    group: root
    mode: '0644'
  when: shape_nccl.stdout == '"BM.GPU4.8"'

playbooks/site.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
when: cluster_network|bool and not use_compute_agent|default(false)|bool
6565
- include_role:
6666
name: nvidia_peermem
67+
- include_role:
68+
name: nccl-conf
69+
when: cluster_network|bool
6770

6871
- hosts: controller
6972
become: true

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ fi
6161
-x RX_QUEUE_LEN=8192 \
6262
-x IB_RX_QUEUE_LEN=8192 \
6363
-x NCCL_SOCKET_IFNAME=eth0 \
64-
-x NCCL_ALGO=auto \
6564
-x NCCL_IGNORE_CPU_AFFINITY=1 \
6665
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
6766
-x NCCL_TOPO_FILE=~/H100-topology.xml \
68-
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
67+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,10 @@ do
7575
-x RX_QUEUE_LEN=8192 \
7676
-x IB_RX_QUEUE_LEN=8192 \
7777
-x NCCL_SOCKET_IFNAME=eth0 \
78-
-x NCCL_ALGO=auto \
7978
-x NCCL_IGNORE_CPU_AFFINITY=1 \
8079
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
8180
-x NCCL_TOPO_FILE=~/H100-topology.xml \
82-
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile
81+
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
8382

8483
tail -n 32 $logfile
8584

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
#!/bin/bash
#SBATCH --job-name=nccl-allreduce-slurm-containers
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive

# Runs the NCCL all_reduce_perf benchmark inside an enroot/pyxis container
# on BM.GPU.H100.8 nodes, sourcing the host OpenMPI and exporting the H100
# NCCL tuning below. Job artifacts go under /nfs/cluster/$SLURM_JOB_ID.
export PMI_DEBUG=1

cd /nfs/cluster || exit 1
mkdir -p "$SLURM_JOB_ID"   # -p: tolerate a requeued job re-using the same ID
cd "$SLURM_JOB_ID" || exit 1

MACHINEFILE="hostfile"

scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
echo MACHINEFILE
cat $MACHINEFILE

source /etc/os-release

# Prefer the Mellanox OFED OpenMPI; fall back to a local /opt build.
MPIVARS_PATH=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`

if [[ "$MPIVARS_PATH" == "" ]]; then
    MPIVARS_PATH=`ls /opt/openmpi-*/bin/mpivars.sh`
fi

if [[ "$MPIVARS_PATH" == "" ]]; then
    echo "Could not find MPIPATH"; exit 1; fi

source $MPIVARS_PATH
LOCAL_MPI=${MPIVARS_PATH%/*}

# This script's NCCL settings are only valid for BM.GPU.H100.8; bail out
# on any other shape instead of running srun with a broken configuration
# (the original fell through with var_UCX_NET_DEVICES unset).
# jq .shape keeps the JSON quotes, hence the \"...\" comparison.
shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
if [ "$shape" == \"BM.GPU.H100.8\" ]
then
  var_UCX_NET_DEVICES=eth0
else
  echo "Use the appropriate nccl test run script for non H100 nodes"
  exit 1
fi

# H100 NCCL/UCX tuning — keep in sync with the nccl-conf role's h100 file.
# The leading '=' in NCCL_IB_HCA requests exact-match HCA names.
export NCCL_CROSS_NIC=0 \
       NCCL_SOCKET_NTHREADS=16 \
       NCCL_DEBUG=WARN \
       NCCL_CUMEM_ENABLE=0 \
       NCCL_IB_SPLIT_DATA_ON_QPS=0 \
       NCCL_IB_QPS_PER_CONNECTION=16 \
       NCCL_IB_GID_INDEX=3 \
       NCCL_IB_TC=41 \
       NCCL_IB_SL=0 \
       NCCL_IB_TIMEOUT=22 \
       NCCL_NET_PLUGIN=none \
       NCCL_SOCKET_IFNAME=eth0 \
       NCCL_IGNORE_CPU_AFFINITY=1 \
       NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17" \
       NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml \
       HCOLL_ENABLE_MCAST_ALL=0 \
       coll_hcoll_enable=0 \
       UCX_TLS=tcp \
       UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
       RX_QUEUE_LEN=8192 \
       IB_RX_QUEUE_LEN=8192 \
       OMPI_MCA_coll=^hcoll

env | grep "SLURMD_NODENAME="
USER=`whoami`

CONTAINER_IMAGE="/home/ubuntu/nvcr.io+nvidia+pytorch+24.01-py3.sqsh"
CONTAINER_MOUNTS="/opt/oci-hpc/nccl-test:/nccl,$LOCAL_MPI:$LOCAL_MPI,/nfs/cluster:/nfs/cluster"
echo $LOCAL_MPI
echo $MPIVARS_PATH

# SLURM_GPUS_PER_NODE is set by Slurm in the job environment when
# --gpus-per-node is requested; SBATCH_GPUS_PER_NODE (used previously) is
# only an *input* override for sbatch and is normally unset here.
srun --mpi=pmi2 --gpus-per-node=$SLURM_GPUS_PER_NODE \
     --ntasks-per-node=$SLURM_NTASKS_PER_NODE \
     --container-image=$CONTAINER_IMAGE \
     --container-mounts=$CONTAINER_MOUNTS \
     bash -c "
     source $MPIVARS_PATH &&
     /nccl/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
     "

0 commit comments

Comments
 (0)