Commit 251a3d4

Merge branch '2.10.5' into 2.10.5_ds_prompt_for_resize

2 parents da70984 + 0351666

13 files changed: +227 −41 lines changed

conf/variables.tpl

Lines changed: 6 additions & 4 deletions

@@ -56,10 +56,12 @@ variable "marketplace_version_id" {
     "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826"
     "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229"
     "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709"
-    "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2024.02.27-0"
-    "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.02.27-0"
-    "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-CUDA-12.3-2024.02.27-0"
-    "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-CUDA-12.3-2024.02.27-0"
+    "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0"
+    "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0"
+    "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0"
+    "GPU_OL8_CUDA12.2" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0"
+    "GPU_OL7_CUDA12.4" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0"
+    "GPU_OL8_CUDA12.4" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0"
   }
 }
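
The marketplace image map moves to the OFED 23.10 builds and splits each GPU image key by CUDA version (12.2 and 12.4). To check what a key resolves to without touching any infrastructure, Terraform's console can evaluate the map; a sketch, assuming the template has been rendered into the active stack's variables file:

    # Resolve one of the new CUDA-specific keys to its image name
    echo 'var.marketplace_version_id["GPU_OL8_CUDA12.4"]' | terraform console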

playbooks/new_nodes.yml

Lines changed: 3 additions & 3 deletions

@@ -54,9 +54,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool
 
 - hosts: controller,slurm_backup,login,compute
   become: true
@@ -174,6 +171,9 @@
       when: enroot|default(true)|bool
     - include_role:
         name: tuned
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool
 
 - hosts: compute
   tasks:
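
This relocation moves the nccl-conf role out of the cluster-network play and into the later play that also applies tuned, so the NCCL settings are written after general node configuration; resize_add.yml and site.yml below carry the same change. To spot-check the new placement after pulling the commit (a sketch, run from the repository root):

    # Show where the nccl-conf include now sits in each playbook
    grep -n -B1 -A2 "nccl-conf" playbooks/new_nodes.yml playbooks/resize_add.yml playbooks/site.yml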

playbooks/resize_add.yml

Lines changed: 3 additions & 3 deletions

@@ -52,9 +52,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool
 
 - hosts: controller,slurm_backup,login,compute
   become: true
@@ -176,6 +173,9 @@
       when: enroot|default(true)|bool
     - include_role:
         name: tuned
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool
 
 - hosts: all
   become: true

playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml

Lines changed: 6 additions & 6 deletions

@@ -114,21 +114,21 @@
 
 # - name: debug
 #   debug:
-#     msg: "Replacing line: SwitchName={{upperswitchnames[item]}}\\sSwitches.* with SwitchName={{upperswitchnames[item]}} Switches={{racks_on_switch_dict[item] | difference(switchnames[item]) | join(',') }}"
+#     msg: "Replacing line: SwitchName={{upperswitchnames[item]}}\\sSwitches.* with SwitchName={{upperswitchnames[item]}} Switches={{racks_on_switch_dict[item] | difference([switchnames[item]]) | join(',') }}"
 #   with_items: "{{unreachable_slurm_nodes}}"
-#   when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1
+#   when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference([switchnames[item]]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1
 #   run_once: true
 #   delegate_to: 127.0.0.1
 
 - name: change upper switch line from topology line
   lineinfile:
     path: "{{ slurm_conf_path }}/topology.conf"
     regexp: "SwitchName={{upperswitchnames[item]}}\\sSwitches.*"
-    line: "SwitchName={{upperswitchnames[item]}} Switches={{racks_on_switch_dict[item] | difference(switchnames[item]) | join(',') }}"
+    line: "SwitchName={{upperswitchnames[item]}} Switches={{racks_on_switch_dict[item] | difference([switchnames[item]]) | join(',') }}"
     state: present
   with_items: "{{unreachable_slurm_nodes}}"
   ignore_errors: yes
-  when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2
+  when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference([switchnames[item]]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2
   run_once: true
   delegate_to: 127.0.0.1
 
@@ -137,7 +137,7 @@
 #   msg: "removing line line: SwitchName={{upperswitchnames[item]}}\\sSwitches.*"
 #   with_items: "{{unreachable_slurm_nodes}}"
 #   ignore_unreachable: yes
-#   when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1
+#   when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference([switchnames[item]]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1
 #   run_once: true
 #   delegate_to: 127.0.0.1
 
@@ -148,7 +148,7 @@
     state: absent
   with_items: "{{unreachable_slurm_nodes}}"
   ignore_unreachable: yes
-  when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2
+  when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference([switchnames[item]]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2
   run_once: true
   delegate_to: 127.0.0.1
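
The only change in this file is wrapping switchnames[item] in brackets. Ansible's difference filter does list set-subtraction; handed a bare string on the right-hand side, it compares the list items against the string's individual characters, so the switch name was never actually removed (newer Ansible releases warn about or reject non-list arguments outright). A quick ad-hoc illustration with placeholder rack names (a sketch; requires a local Ansible install):

    # Bare string: 'rackA' is compared against the characters r,a,c,k,A -- nothing is removed
    ansible localhost -m debug -a "msg={{ ['rackA','rackB'] | difference('rackA') }}"
    # One-element list: the exact item 'rackA' is removed, as the fixed tasks intend
    ansible localhost -m debug -a "msg={{ ['rackA','rackB'] | difference(['rackA']) }}"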

Lines changed: 3 additions & 4 deletions

@@ -1,5 +1,4 @@
-NCCL_CROSS_NIC=0
-NCCL_SOCKET_NTHREADS=16
+NCCL_CROSS_NIC=1
 NCCL_DEBUG=WARN
 NCCL_CUMEM_ENABLE=0
 NCCL_IB_SPLIT_DATA_ON_QPS=0
@@ -8,8 +7,8 @@ NCCL_IB_GID_INDEX=3
 NCCL_IB_TC=41
 NCCL_IB_SL=0
 NCCL_IB_TIMEOUT=22
+NCCL_BUFFSIZE=16777216
 NCCL_NET_PLUGIN=none
 NCCL_SOCKET_IFNAME=eth0
 NCCL_IGNORE_CPU_AFFINITY=1
-NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
-NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml
+NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
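
Three behavioral changes land in this conf: NCCL_CROSS_NIC moves from 0 to 1, NCCL_SOCKET_NTHREADS and the static NCCL_TOPO_FILE are dropped, and NCCL_BUFFSIZE is pinned to 16 MiB. The double '==' in the NCCL_IB_HCA line is intentional: the value's leading '=' is NCCL's exact-match prefix for HCA names, not a typo. A sketch for verifying what a job actually picked up (slurm-<jobid>.out is a placeholder for the job log, and verification needs NCCL_DEBUG raised to INFO):

    # NCCL_BUFFSIZE is in bytes: 16 MiB
    echo $((16 * 1024 * 1024))    # 16777216
    # With NCCL_DEBUG=INFO, NCCL logs the parameters it reads from the
    # environment or config file, so the new values show up in the job output
    grep -iE "BUFFSIZE|CROSS_NIC" slurm-<jobid>.out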

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 21 additions & 1 deletion

@@ -33,4 +33,24 @@
     owner: root
     group: root
     mode: '0644'
-  when: shape_nccl.stdout == '"BM.GPU4.8"'
+  when: shape_nccl.stdout == '"BM.GPU4.8"'
+
+- name: copy libnccl-ocituner for OL
+  become: true
+  get_url:
+    url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL
+    dest: /home/opc/libnccl-ocituner.so.1.0.1
+    owner: opc
+    group: privilege
+    mode: '0775'
+  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_distribution == 'OracleLinux'
+
+- name: copy libnccl-ocituner for Ubuntu
+  become: true
+  get_url:
+    url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu
+    dest: /home/ubuntu/libnccl-ocituner.so.1.0.1
+    owner: ubuntu
+    group: privilege
+    mode: '0775'
+  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_os_family == 'Debian'
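
The two new tasks fetch the OCI NCCL tuner plugin from Object Storage onto the A100 and GPU4 shapes; the new sbatch sample at the bottom of this commit loads it via NCCL_TUNER_PLUGIN. A minimal post-deploy check (a sketch, assuming an Oracle Linux node, where the plugin lands in /home/opc):

    # Confirm the plugin downloaded with the expected ownership and 0775 mode
    ls -l /home/opc/libnccl-ocituner.so.1.0.1
    # Confirm it is a shared object for this architecture
    file /home/opc/libnccl-ocituner.so.1.0.1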

playbooks/site.yml

Lines changed: 3 additions & 3 deletions

@@ -64,9 +64,6 @@
       when: cluster_network|bool and not use_compute_agent|default(false)|bool
     - include_role:
         name: nvidia_peermem
-    - include_role:
-        name: nccl-conf
-      when: cluster_network|bool
 
 - hosts: controller
   become: true
@@ -264,6 +261,9 @@
     - include_role:
         name: hyperthreading
       when: not hyperthreading|default(false)|bool
+    - include_role:
+        name: nccl-conf
+      when: cluster_network|bool
 
 - hosts: all
   tasks:

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 6 additions & 4 deletions

@@ -43,8 +43,7 @@ fi
          --bind-to numa \
          -npernode 8 \
          --mca coll ^hcoll \
-         -x NCCL_CROSS_NIC=0 \
-         -x NCCL_SOCKET_NTHREADS=16 \
+         -x NCCL_CROSS_NIC=1 \
          -x NCCL_DEBUG=WARN \
          -x NCCL_CUMEM_ENABLE=0 \
          -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
@@ -60,8 +59,11 @@ fi
          -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
          -x RX_QUEUE_LEN=8192 \
          -x IB_RX_QUEUE_LEN=8192 \
+         -x NCCL_BUFFSIZE=16777216 \
          -x NCCL_SOCKET_IFNAME=eth0 \
          -x NCCL_IGNORE_CPU_AFFINITY=1 \
          -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
-         -x NCCL_TOPO_FILE=~/H100-topology.xml \
-         --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
+         --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
+
+# If the NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance:
+#          -x NCCL_TOPO_FILE=~/H100-topology.xml \
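
Both H100 samples (this sbatch and the .sh below) now rely on NCCL's built-in topology detection and only mention H100-topology.xml in a trailing comment, since NCCL 2.20.3 and newer no longer need the file. On a cluster still running an older NCCL, the flag could be restored conditionally; a sketch, assuming NCCL is installed as an RPM named nccl (adjust the package query on Ubuntu):

    # Re-enable the topology file only when the installed NCCL is older than 2.20.3
    nccl_ver=$(rpm -q --qf '%{VERSION}' nccl 2>/dev/null)
    topo_flag=""
    if [ -n "$nccl_ver" ] && [ "$(printf '%s\n' "$nccl_ver" 2.20.3 | sort -V | head -1)" != "2.20.3" ]; then
        topo_flag="-x NCCL_TOPO_FILE=$HOME/H100-topology.xml"
    fi
    # ...then pass $topo_flag to mpirun alongside the other -x options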

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 6 additions & 3 deletions

@@ -57,8 +57,7 @@ do
          --bind-to numa \
          -npernode 8 \
          --mca coll ^hcoll \
-         -x NCCL_CROSS_NIC=0 \
-         -x NCCL_SOCKET_NTHREADS=16 \
+         -x NCCL_CROSS_NIC=1 \
          -x NCCL_DEBUG=WARN \
          -x NCCL_CUMEM_ENABLE=0 \
          -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
@@ -74,6 +73,7 @@ do
          -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
          -x RX_QUEUE_LEN=8192 \
          -x IB_RX_QUEUE_LEN=8192 \
+         -x NCCL_BUFFSIZE=16777216 \
          -x NCCL_SOCKET_IFNAME=eth0 \
          -x NCCL_IGNORE_CPU_AFFINITY=1 \
          -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
@@ -82,5 +82,8 @@ do
 
 tail -n 32 $logfile
 
+done
 
-done
+
+# If the NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance:
+#          -x NCCL_TOPO_FILE=~/H100-topology.xml \
Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+#!/bin/bash
+#SBATCH --job-name=nccl-allreduce-slurm
+#SBATCH --nodes=2
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --exclusive
+export PMI_DEBUG=1
+
+
+cd /nfs/cluster
+mkdir $SLURM_JOB_ID
+cd $SLURM_JOB_ID
+
+MACHINEFILE="hostfile"
+ORDEREDMACHINEFILE="ordered_hostfile_system_name"
+ORDEREDRANKMACHINEFILE="rankfile_system_name"
+
+scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
+echo MACHINEFILE
+cat $MACHINEFILE
+
+source /etc/os-release
+if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
+    python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+    homedirectory=/home/opc
+elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
+    python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+    homedirectory=/home/ubuntu
+fi
+
+
+echo ORDEREDMACHINEFILE
+cat $ORDEREDMACHINEFILE
+echo ORDEREDRANKMACHINEFILE
+cat $ORDEREDRANKMACHINEFILE
+
+mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
+
+if [[ "$mpivars_path" == "" ]]; then
+    mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
+fi
+
+if [[ "$mpivars_path" == "" ]]; then
+    echo "Could not find MPIPATH"; exit; fi
+
+source $mpivars_path
+
+shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
+if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
+then
+    var_UCX_NET_DEVICES=mlx5_0:1
+elif [ $shape == \"BM.GPU4.8\" ]
+then
+    var_UCX_NET_DEVICES=mlx5_4:1
+fi
+
+mpirun --mca pml ucx \
+       --bind-to numa \
+       --mca coll ^hcoll \
+       -x UCX_TLS=ud,self,sm \
+       -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
+       -x HCOLL_ENABLE_MCAST_ALL=0 \
+       -x coll_hcoll_enable=0 \
+       -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \
+       --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
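
This new sample (its filename is not shown on this page) ties the pieces together: it rack-orders the hosts, then loads the downloaded tuner through NCCL_TUNER_PLUGIN instead of a static topology file. A usage sketch, with the sample path left as a placeholder:

    # Submit on the 2 nodes requested in the #SBATCH header; with -b1G -e10G and a
    # 9 GiB increment (-i), only the 1 GiB and 10 GiB message sizes are run
    sbatch samples/gpu/<new_tuner_sample>.sbatch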
