
Commit 480407f

Merge branch '2.10.5' into 2.10.5_ds_nccl_conf
2 parents: 96841dc + d491c24

File tree: 7 files changed, +192 -10 lines

7 files changed

+192
-10
lines changed

autoscaling/tf_init/locals.tf
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ locals {
   image_ocid = var.unsupported ? var.image_ocid : var.image

   shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape
-  instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
+  instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
   // ips of the instances
   cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip

bin/controller.sh
Lines changed: 1 addition & 0 deletions

@@ -45,6 +45,7 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
   sudo ln -s /usr/local/bin/ansible /bin/ansible
 fi
 sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo
+sudo sed -i 's/$releasever/'"${vid}"'/g' /etc/yum.repos.d/hashicorp.repo
 sudo yum install -y terraform
 sudo python3 -m pip install -U pip
 sudo python3 -m pip install netaddr --upgrade
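For context, a minimal Python sketch of what the added sed line does: it replaces the literal $releasever yum variable inside the HashiCorp repo file with a concrete release number. The value assigned to vid below is an assumption; controller.sh sets vid elsewhere, outside this hunk.

import pathlib

# "$releasever" is a literal yum variable inside hashicorp.repo, not a shell
# variable, so a plain string replacement mirrors the sed one-liner above.
repo = pathlib.Path("/etc/yum.repos.d/hashicorp.repo")
vid = "8"  # assumed value; in controller.sh, vid is derived from the OS release
repo.write_text(repo.read_text().replace("$releasever", vid))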

locals.tf
Lines changed: 3 additions & 3 deletions

@@ -8,9 +8,9 @@ locals {
   custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image

   shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape
-  instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
-  controller_ocpus = var.controller_shape == "VM.DenseIO.E4.Flex" ? var.controller_ocpus_denseIO_flex : var.controller_ocpus
-  login_ocpus = var.login_shape == "VM.DenseIO.E4.Flex" ? var.login_ocpus_denseIO_flex : var.login_ocpus
+  instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
+  controller_ocpus = ( var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex" ) ? var.controller_ocpus_denseIO_flex : var.controller_ocpus
+  login_ocpus = ( var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex" ) ? var.login_ocpus_denseIO_flex : var.login_ocpus
   // ips of the instances
   cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip

playbooks/roles/localdisk/tasks/common.yml
Lines changed: 6 additions & 6 deletions

@@ -5,7 +5,7 @@

 - name: Get the number of NVMe's
   set_fact:
-    nvme_count: "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list | length}}"
+    nvme_count: "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list | length}}"

 - name: Create a LVM?
   set_fact:
@@ -18,15 +18,15 @@
     state: present
     label: gpt
   with_items:
-    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}"
+    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list }}"

 - name: create a filesystem
   filesystem:
     dev: "/dev/{{item}}p1"
     fstype: xfs
     opts: "-L locscratch{{item | replace('nvme','') | replace('n1','')}}"
   with_items:
-    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}"
+    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list }}"
   when: not ( one_lv | bool )

 - name: Mount local volume
@@ -37,7 +37,7 @@
     opts: defaults,noatime
     state: mounted
   with_items:
-    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}"
+    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list }}"
   when: not ( one_lv | bool )

 - name: "set permissions on {{ nvme_path_edited }}"
@@ -50,7 +50,7 @@
     group: "{{privilege_group_name}}"
     recurse: no
   with_items:
-    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}"
+    - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list }}"
   when: not ( one_lv | bool )

 - name: Check for lvm devices
@@ -61,7 +61,7 @@
 - name: Create volume group
   lvg:
     vg: "vg_nvmes"
-    pvs: "{{['/dev/']|product(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list)|map('join', '') | join(',')}}"
+    pvs: "{{['/dev/']|product(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list)|map('join', '') | join(',')}}"

 - name: Create Logical volume
   lvol:
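The old pattern nvme[0-9]n1 only matched single-digit namespaces (nvme0n1 through nvme9n1); the widened pattern also picks up two-digit devices such as nvme10n1. A minimal Python sketch of how the new pattern behaves under Ansible's match test (which, like re.match, anchors at the start of the string); the device names below are hypothetical examples:

import re

# Ansible's `match` test is anchored at the start of the device name, like re.match.
devices = ["nvme0n1", "nvme9n1", "nvme10n1", "nvme31n1", "sda"]

old = r'nvme[0-9]n1'             # matches only single-digit namespaces
new = r'nvme[0-9]|[1-9][0-9]n1'  # first alternative matches any name starting with nvme<digit>

print([d for d in devices if re.match(old, d)])  # ['nvme0n1', 'nvme9n1']
print([d for d in devices if re.match(new, d)])  # ['nvme0n1', 'nvme9n1', 'nvme10n1', 'nvme31n1']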

playbooks/roles/slurm/templates/slurm.conf.j2
Lines changed: 4 additions & 0 deletions

@@ -113,6 +113,8 @@
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "VM.DenseIO.E4.Flex" %}
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
+{% elif instance.shape == "VM.DenseIO.E5.Flex" %}
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "VM.Standard.A1.Flex" %}
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "BM.Standard.E3.128" and threadspercore == 1%}
@@ -127,6 +129,8 @@
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "BM.DenseIO.E4.128" and threadspercore == 2 %}
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
+{% elif instance.shape == "BM.DenseIO.E5.128" %}
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "BM.HPC2.36" %}
 NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }}
 {% elif instance.shape == "BM.HPC.E5.144" %}
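To illustrate what the new BM.DenseIO.E5.128 branch renders, a minimal sketch using the jinja2 library; the partition and instance values below are hypothetical stand-ins, not values from this repository:

from jinja2 import Template

# Sketch of the new BM.DenseIO.E5.128 branch of the template.
line = ('NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] '
        'Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} '
        'State=FUTURE Features='
        '{% if instance.shape != instance.name %}{{ instance.shape }},{% endif %}{{ instance.name }}')

print(Template(line).render(
    partition={"name": "compute"},
    instance={"instance_keyword": "dense-e5", "shape": "BM.DenseIO.E5.128", "name": "dense-e5"},
    size=4,
    threadspercore=2,
))
# NodeName=compute-dense-e5-node-[1-4] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore=2 State=FUTURE Features=BM.DenseIO.E5.128,dense-e5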

schema.yaml
Lines changed: 12 additions & 0 deletions

@@ -314,6 +314,9 @@ variables:
       - eq:
         - ${controller_shape}
         - "VM.DenseIO.E4.Flex"
+      - eq:
+        - ${controller_shape}
+        - "VM.DenseIO.E5.Flex"
     required: true

   controller_custom_memory:
@@ -643,6 +646,9 @@ variables:
       - eq:
         - ${instance_pool_shape}
         - "VM.DenseIO.E4.Flex"
+      - eq:
+        - ${instance_pool_shape}
+        - "VM.DenseIO.E5.Flex"
     required: true

   instance_pool_custom_memory:
@@ -753,6 +759,9 @@ variables:
     title: "use compute agent"
     description: "Select if your image has the OCA agent rather than the oci-cn-auth package. The new marketplace images need the compute agent enabled."
     default: true
+    visible:
+      not:
+        - ${use_marketplace_image}

   compute_image_compartment:
     title: "compute image compartment"
@@ -1398,6 +1407,9 @@ variables:
       - eq:
         - ${login_shape}
         - "VM.DenseIO.E4.Flex"
+      - eq:
+        - ${login_shape}
+        - "VM.DenseIO.E5.Flex"
       - ${login_node}
     required: true

scripts/h100_script.py
Lines changed: 165 additions & 0 deletions (new file)

@@ -0,0 +1,165 @@
import os
from datetime import datetime
import argparse
import subprocess
import sys


def getDateTime():
    # datetime object containing current date and time
    now = datetime.now()
    dt_string = now.strftime("%m%d%Y%H%M%S")
    return dt_string


# create directory to hold results
def createDir():
    # directory name
    directory = str("/tmp/" + getDateTime())
    try:
        os.mkdir(directory)
    except OSError as error:
        print(error)
        sys.exit(-1)
    return directory


# change ownership of all files to user so that the files can be copied
def changeOwner(path):
    username = os.getlogin()
    cmd = f'sudo chown -R {username}:{username} {path}'
    run_cmd(cmd)


def getSshableNodes(hosts, path):
    hosts_file = open(hosts, "r")
    ssh_list = path + "/" + "sshable"
    not_ssh_list = path + "/" + "notsshable"
    sshable = open(ssh_list, "a")
    notsshable = open(not_ssh_list, "a")
    for line in hosts_file:
        host = line.split()
        host_value = host[0]
        cmd = f'ssh -o ConnectTimeout=10 {host_value} "cat /etc/os-release | grep PRETTY_NAME"'
        isSshable = run_cmd(cmd)
        if not isSshable:
            notsshable.write(host_value)
            notsshable.write("\n")
        elif 'PRETTY_NAME' in isSshable[0]:
            sshable.write(host_value)
            sshable.write("\n")
        else:
            notsshable.write(host_value)
            notsshable.write("\n")
    sshable.close()
    notsshable.close()
    hosts_file.close()
    return ssh_list


def run_cmd(cmd=None):
    """ Run command on shell"""
    try:
        results = subprocess.run(cmd, shell=True, executable='/bin/bash', stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT, encoding='utf8')
        output = results.stdout.splitlines()
    except subprocess.CalledProcessError as e:
        print(f'Command {e.cmd} failed with error {e.returncode}')
        return e.returncode
    return output


# get interfaces that are Down
def ibdev(hosts, path):
    log_file = path + "/" + "ibdev2netdev"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; ibdev2netdev | grep Down"; done > {log_file}'
    run_cmd(cmd)


# get EAP-FAILURE
def eapFailure(hosts, path):
    log_file = path + "/" + "eapfailure"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; cat /var/log/syslog | grep "EAP-FAILURE""; done > {log_file}'
    run_cmd(cmd)


# get rdma links authentication
def rdmaAuth(hosts, path):
    log_file = path + "/" + "rdmaauth"
    hosts_file = open(hosts, "r")
    log_file = path + "/" + "rdmaauth"
    rdma_file = open(log_file, "a")
    for line in hosts_file:
        host = line.split()
        host_value = host[0]
        cmd = f'ssh {host_value} "hostname; hostname -i; sudo dmidecode -s system-serial-number"'
        output = run_cmd(cmd)
        for o in output:
            rdma_file.write(o)
            rdma_file.write("\n")
        cmd = f'ssh {host_value} \'for x in $(seq 0 15) ; do sudo wpa_cli -i rdma$x status | grep EAP ; done\''
        output = run_cmd(cmd)
        for o in output:
            rdma_file.write(o)
            rdma_file.write("\n")
    rdma_file.close()
    hosts_file.close()


# get logs for Link Flapping
def linksDown(hosts, path):
    log_file = path + "/" + "linkflapping"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; cat /var/log/syslog | grep "Link " | tail -36"; done > {log_file}'
    run_cmd(cmd)


# Check any GPU fallen off the bus
def lspci(hosts, path):
    log_file = path + "/" + "lspci"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; lspci | grep "rev ff""; done > {log_file}'
    run_cmd(cmd)


# Check for NVRM errors
def nvrm(hosts, path):
    log_file = path + "/" + "nvrm"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; sudo dmesg | grep NVRM"; done > {log_file}'
    run_cmd(cmd)


# Check for Pending remaps
def pending(hosts, path):
    log_file = path + "/" + "pending_remaps"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; nvidia-smi -q | grep "Pending : Yes""; done > {log_file}'
    run_cmd(cmd)


# Check for Remapping failures
def remapping(hosts, path):
    log_file = path + "/" + "remapping_failures"
    cmd = f'for i in $(cat {hosts}); do ssh $i "hostname; hostname -i; sudo dmidecode -s system-serial-number; nvidia-smi -q | grep "Remapping Failure Occurred : Yes""; done > {log_file}'
    run_cmd(cmd)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = 'Capture H100 troubleshooting data.')
    parser.add_argument('--hosts', help = "Provide a filepath that contains list of either IPs / hostnames one per line on which you want to run this script.", required = True)
    args = parser.parse_args()
    hosts = args.hosts
    if hosts is None:
        print("Hostfile is required. Please provide one and run again.")
        sys.exit(-1)
    else:
        path = createDir()
        changeOwner(path)
        ssh_hosts = getSshableNodes(hosts, path)
        ibdev(ssh_hosts, path)
        eapFailure(ssh_hosts, path)
        rdmaAuth(ssh_hosts, path)
        linksDown(ssh_hosts, path)
        lspci(ssh_hosts, path)
        nvrm(ssh_hosts, path)
        pending(ssh_hosts, path)
        remapping(ssh_hosts, path)
        print("The results are at location: " + path)
