
Commit a961253

Merge branch '2.10.6' into 2.10.6_ds_disable_scratch

2 parents f08d5d9 + ddf95e7

14 files changed (+190 -121 lines)

autoscaling/tf_init/cluster-network-configuration.tf

Lines changed: 7 additions & 1 deletion
@@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
         name          = "Compute HPC RDMA Auto-Configuration"
         desired_state = plugins_config.value
       }
-
+    }
+    dynamic plugins_config {
+      for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
+      content {
+        name          = "Compute RDMA GPU Monitoring"
+        desired_state = plugins_config.value
+      }
     }
   }
   dynamic "platform_config" {

autoscaling/tf_init/instance-pool-configuration.tf

Lines changed: 16 additions & 1 deletion
@@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
       user_data = base64encode(data.template_file.config.rendered)
     }
     agent_config {
-      is_management_disabled = true
+
+      are_all_plugins_disabled = false
+      is_management_disabled   = true
+      is_monitoring_disabled   = false
+
+      plugins_config {
+        desired_state = "DISABLED"
+        name          = "OS Management Service Agent"
+      }
+      dynamic plugins_config {
+        for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
+        content {
+          name          = "Compute RDMA GPU Monitoring"
+          desired_state = plugins_config.value
+        }
       }
+    }
     shape = var.instance_pool_shape

     dynamic "shape_config" {

bin/resize.py

Lines changed: 1 addition & 0 deletions
@@ -753,6 +753,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
     if len(unreachable_instances):
         if not remove_unreachable:
             print("STDOUT: At least one unreachable node is in the inventory")
+            print(unreachable_instances)
             print("STDOUT: Not doing anything")
             exit(1)
         else:

cluster-network-configuration.tf

Lines changed: 7 additions & 1 deletion
@@ -45,7 +45,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
         name          = "Compute HPC RDMA Auto-Configuration"
         desired_state = plugins_config.value
       }
-
+    }
+    dynamic plugins_config {
+      for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
+      content {
+        name          = "Compute RDMA GPU Monitoring"
+        desired_state = plugins_config.value
+      }
     }
   }

compute-nodes.tf

Lines changed: 7 additions & 1 deletion
@@ -48,7 +48,13 @@ resource "oci_core_instance" "compute_cluster_instances" {
         name          = "Compute HPC RDMA Auto-Configuration"
         desired_state = plugins_config.value
       }
-
+    }
+    dynamic plugins_config {
+      for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
+      content {
+        name          = "Compute RDMA GPU Monitoring"
+        desired_state = plugins_config.value
+      }
     }
   }

instance-pool-configuration.tf

Lines changed: 16 additions & 1 deletion
@@ -22,8 +22,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
       user_data = base64encode(data.template_file.config.rendered)
     }
     agent_config {
-      is_management_disabled = true
+
+      are_all_plugins_disabled = false
+      is_management_disabled   = true
+      is_monitoring_disabled   = false
+
+      plugins_config {
+        desired_state = "DISABLED"
+        name          = "OS Management Service Agent"
+      }
+      dynamic plugins_config {
+        for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
+        content {
+          name          = "Compute RDMA GPU Monitoring"
+          desired_state = plugins_config.value
+        }
       }
+    }
     shape = var.instance_pool_shape

     dynamic "shape_config" {

playbooks/roles/healthchecks/files/check_gpu_setup.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ def check_rdma_link_status():
             status = False
         if recommendation != "No issue was observed":
             logger.debug(f"{device}: {recommendation}")
-            if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09:
+            if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-07:
                 logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored")
             else :
                 logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}")

playbooks/roles/nccl-conf/files/h100

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
-NCCL_CROSS_NIC=1
+NCCL_CROSS_NIC=2
 NCCL_DEBUG=WARN
 NCCL_CUMEM_ENABLE=0
 NCCL_IB_SPLIT_DATA_ON_QPS=0
-NCCL_IB_QPS_PER_CONNECTION=16
+NCCL_IB_QPS_PER_CONNECTION=1
 NCCL_IB_GID_INDEX=3
 NCCL_IB_TC=41
 NCCL_IB_SL=0

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 1 addition & 21 deletions
@@ -33,24 +33,4 @@
     owner: root
     group: root
     mode: '0644'
-  when: shape_nccl.stdout == '"BM.GPU4.8"'
-
-- name: copy libnccl-ocituner for OL
-  become: true
-  get_url:
-    url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL
-    dest: /home/opc/libnccl-ocituner.so.1.0.1
-    owner: opc
-    group: privilege
-    mode: '0775'
-  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_distribution == 'OracleLinux'
-
-- name: copy libnccl-ocituner for Ubuntu
-  become: true
-  get_url:
-    url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu
-    dest: /home/ubuntu/libnccl-ocituner.so.1.0.1
-    owner: ubuntu
-    group: privilege
-    mode: '0775'
-  when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_os_family == 'Debian'
+  when: shape_nccl.stdout == '"BM.GPU4.8"'
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+#!/bin/bash
+#SBATCH --job-name=nccl-allreduce-slurm
+#SBATCH --nodes=2
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --exclusive
+export PMI_DEBUG=1
+
+
+cd /nfs/scratch
+mkdir $SLURM_JOB_ID
+cd $SLURM_JOB_ID
+
+MACHINEFILE="hostfile"
+ORDEREDMACHINEFILE="ordered_hostfile_system_name"
+ORDEREDRANKMACHINEFILE="rankfile_system_name"
+
+scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
+echo MACHINEFILE
+cat $MACHINEFILE
+
+source /etc/os-release
+if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
+  python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
+  python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
+fi
+
+
+echo ORDEREDMACHINEFILE
+cat $ORDEREDMACHINEFILE
+echo ORDEREDRANKMACHINEFILE
+cat $ORDEREDRANKMACHINEFILE
+
+mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh`
+
+if [[ "$mpivars_path" == "" ]]; then
+  mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh`
+fi
+
+if [[ "$mpivars_path" == "" ]]; then
+  echo "Could not find MPIPATH"; exit; fi
+
+source $mpivars_path
+
+export NCCL_DEBUG=WARN
+
+#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
+# no need to pass: -x SLURM_JOB_NODELIST=$host_list
+
+shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
+if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ]
+then
+  var_UCX_NET_DEVICES=mlx5_0:1
+  var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12"
+elif [ $shape == \"BM.GPU4.8\" ]
+then
+  var_UCX_NET_DEVICES=mlx5_4:1
+  var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17"
+fi
+
+NCCL_version=`sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//'`
+arr_NCCL=(${NCCL_version//./ })
+if [ ${arr_NCCL[2]} > 20 ]
+then
+  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1
+else
+  tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2
+fi
+
+
+mpirun --mca pml ucx \
+  --bind-to numa \
+  --mca coll ^hcoll \
+  -x NCCL_DEBUG=WARN \
+  -x NCCL_IB_SL=0 \
+  -x NCCL_IB_TC=41 \
+  -x NCCL_IB_QPS_PER_CONNECTION=4 \
+  -x UCX_TLS=ud,self,sm \
+  -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
+  -x HCOLL_ENABLE_MCAST_ALL=0 \
+  -x coll_hcoll_enable=0 \
+  -x NCCL_IB_GID_INDEX=3 \
+  -x NCCL_TUNER_PLUGIN=${tuner_path} \
+  -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
+  --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100
+
+
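
The tuner selection in the new script above keys off the third component of the installed NCCL version string (arr_NCCL[2]) and picks the 2.0.1 ocituner build when that component exceeds 20, otherwise the 1.0.2 build. A small Python sketch of that intent (illustrative only; it compares the component numerically, the function name and version strings are examples, and the tuner paths are copied from the script):

def pick_tuner(nccl_version: str) -> str:
    # nccl_version is the suffix after "libnccl.so.", e.g. "2.18.5".
    parts = [int(p) for p in nccl_version.split(".")]
    if parts[2] > 20:
        return "/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1"
    return "/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2"

# Example version strings, for illustration only.
print(pick_tuner("2.18.5"))   # third component 5, not above 20 -> 1.0.2 tuner
print(pick_tuner("2.19.21"))  # third component 21, above 20    -> 2.0.1 tuner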
