Skip to content

Commit d6dc299

Browse files
Merge branch '2.10.5' of https://github.com/oci-hpc/oci-hpc-clusternetwork-dev into 2.10.5
2 parents 0b2949b + d491c24 commit d6dc299

File tree

1 file changed

+165
-0
lines changed

1 file changed

+165
-0
lines changed

scripts/h100_script.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import os
2+
from datetime import datetime
3+
import argparse
4+
import subprocess
5+
import sys
6+
7+
8+
def getDateTime(fmt="%m%d%Y%H%M%S"):
    """Return the current local date and time as a formatted string.

    Args:
        fmt: ``strftime`` format string. The default produces a 14-digit
            ``MMDDYYYYHHMMSS`` stamp.

    Returns:
        ``datetime.now()`` rendered with *fmt*.
    """
    return datetime.now().strftime(fmt)
13+
14+
15+
# create directory to hold results
16+
def createDir():
    """Create a timestamped results directory under /tmp and return its path.

    The directory name is whatever getDateTime() returns. If the directory
    cannot be created, the OSError is printed and the script exits with
    status -1.

    Returns:
        The path of the newly created directory.
    """
    results_dir = "/tmp/" + getDateTime()
    try:
        os.mkdir(results_dir)
    except OSError as mkdir_error:
        print(mkdir_error)
        sys.exit(-1)
    else:
        return results_dir
25+
26+
27+
# change ownership of all files to user so that the files can be copied
28+
def changeOwner(path):
    """Recursively hand ownership of *path* to the invoking user via sudo.

    Runs ``sudo chown -R <user>:<user> <path>`` through run_cmd(). The group
    is set to the user's name, so a group with that name must exist.

    Args:
        path: File or directory whose ownership is changed.
    """
    import getpass
    import shlex

    try:
        username = os.getlogin()
    except OSError:
        # os.getlogin() needs a controlling terminal; without one (cron,
        # nohup, some remote shells) fall back to the sudo caller or the
        # user named in the environment instead of crashing.
        username = os.environ.get("SUDO_USER") or getpass.getuser()
    owner = shlex.quote(f"{username}:{username}")
    # Quote the path so spaces or shell metacharacters cannot split it.
    cmd = f'sudo chown -R {owner} {shlex.quote(path)}'
    run_cmd(cmd)
32+
33+
34+
def getSshableNodes(hosts, path):
    """Split the hosts listed in *hosts* into reachable and unreachable sets.

    The first whitespace-separated token of every non-blank line is taken as
    a host. Each host is probed over ssh (10 second connect timeout) by
    reading PRETTY_NAME from /etc/os-release. Reachable hosts are appended to
    ``<path>/sshable``, all others to ``<path>/notsshable``.

    Args:
        hosts: Path of a file with one IP or hostname per line.
        path: Directory in which the two result files are written.

    Returns:
        The path of the ``sshable`` file.
    """
    ssh_list = path + "/" + "sshable"
    not_ssh_list = path + "/" + "notsshable"
    with open(hosts, "r") as hosts_file, \
            open(ssh_list, "a") as sshable, \
            open(not_ssh_list, "a") as notsshable:
        for line in hosts_file:
            fields = line.split()
            if not fields:
                # Blank line in the hosts file: nothing to probe.
                continue
            host_value = fields[0]
            cmd = f'ssh -o ConnectTimeout=10 {host_value} "cat /etc/os-release | grep PRETTY_NAME"'
            output = run_cmd(cmd)
            # run_cmd() merges stderr into stdout, so an ssh warning may come
            # before the PRETTY_NAME line; look at every line, not just the
            # first one.
            reachable = isinstance(output, list) and any(
                'PRETTY_NAME' in out_line for out_line in output
            )
            target = sshable if reachable else notsshable
            target.write(host_value + "\n")
    return ssh_list
58+
59+
60+
def run_cmd(cmd=None):
    """Run *cmd* through /bin/bash and return its output as a list of lines.

    stderr is merged into stdout. The exit status is not inspected, so the
    output of a failing command is returned just like that of a successful
    one.

    Args:
        cmd: Shell command line to execute. ``None`` or an empty string runs
            nothing.

    Returns:
        The combined stdout/stderr split into lines, or an empty list when
        there is nothing to run or the shell could not be started.
    """
    if not cmd:
        return []
    try:
        results = subprocess.run(
            cmd,
            shell=True,
            executable='/bin/bash',
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding='utf8',
            # Remote logs may contain bytes that are not valid UTF-8; replace
            # them rather than aborting the whole collection run.
            errors='replace',
        )
    except OSError as e:
        # Raised when the shell itself cannot be launched. Without check=True
        # subprocess.run() never raises CalledProcessError.
        print(f'Command {cmd} could not be started: {e}')
        return []
    return results.stdout.splitlines()
70+
71+
72+
# get interfaces that are Down
73+
def ibdev(hosts, path):
    """Record, per host, the ibdev2netdev lines that contain "Down".

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the matching
    ibdev2netdev lines. Everything is written to ``<path>/ibdev2netdev``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    report = path + "/ibdev2netdev"
    per_host = "hostname; hostname -i; sudo dmidecode -s system-serial-number; ibdev2netdev | grep Down"
    run_cmd(f'for i in $(cat {hosts}); do ssh $i "{per_host}"; done > {report}')
77+
78+
79+
# get EAP-FAILURE
80+
def eapFailure(hosts, path):
    """Record, per host, the syslog lines that contain EAP-FAILURE.

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the matching
    lines of /var/log/syslog. Everything is written to ``<path>/eapfailure``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    import shlex

    log_file = path + "/" + "eapfailure"
    # The grep pattern is single-quoted so it survives inside the
    # double-quoted ssh argument; nesting double quotes would end that
    # argument early.
    remote = (
        "hostname; hostname -i; sudo dmidecode -s system-serial-number; "
        "cat /var/log/syslog | grep 'EAP-FAILURE'"
    )
    cmd = f'for i in $(cat {shlex.quote(hosts)}); do ssh $i "{remote}"; done > {shlex.quote(log_file)}'
    run_cmd(cmd)
84+
85+
86+
# get rdma links authentication
87+
def rdmaAuth(hosts, path, num_interfaces=16):
    """Record the wpa_cli EAP status of each host's rdma interfaces.

    For the first token of every non-blank line in *hosts*, the host's name,
    IP address and system serial number are collected over ssh, followed by
    the EAP lines of ``wpa_cli status`` for interfaces ``rdma0`` through
    ``rdma<num_interfaces - 1>``. All output is appended to
    ``<path>/rdmaauth``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
        num_interfaces: Number of rdma interfaces to query on each host.
    """
    log_file = path + "/" + "rdmaauth"
    last_index = num_interfaces - 1
    with open(hosts, "r") as hosts_file, open(log_file, "a") as rdma_file:
        for line in hosts_file:
            fields = line.split()
            if not fields:
                # Blank line in the hosts file: nothing to query.
                continue
            host_value = fields[0]
            identity_cmd = f'ssh {host_value} "hostname; hostname -i; sudo dmidecode -s system-serial-number"'
            eap_cmd = f'ssh {host_value} \'for x in $(seq 0 {last_index}) ; do sudo wpa_cli -i rdma$x status | grep EAP ; done\''
            for cmd in (identity_cmd, eap_cmd):
                output = run_cmd(cmd)
                # Only iterate a real list of lines; anything else means the
                # command produced no usable output.
                lines = output if isinstance(output, list) else []
                rdma_file.writelines(o + "\n" for o in lines)
107+
108+
109+
# get logs for Link Flapping
110+
def linksDown(hosts, path):
    """Record, per host, the last 36 syslog lines that contain "Link ".

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the tail of the
    matching lines of /var/log/syslog. Everything is written to
    ``<path>/linkflapping``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    import shlex

    log_file = path + "/" + "linkflapping"
    # Single quotes keep the trailing space in the pattern. With nested
    # double quotes the ssh argument was split and grep searched for "Link"
    # without the space.
    remote = (
        "hostname; hostname -i; sudo dmidecode -s system-serial-number; "
        "cat /var/log/syslog | grep 'Link ' | tail -36"
    )
    cmd = f'for i in $(cat {shlex.quote(hosts)}); do ssh $i "{remote}"; done > {shlex.quote(log_file)}'
    run_cmd(cmd)
114+
115+
116+
# Check any GPU fallen off the bus
117+
def lspci(hosts, path):
    """Record, per host, the lspci lines that contain "rev ff".

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the matching
    lspci lines. Everything is written to ``<path>/lspci``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    import shlex

    log_file = path + "/" + "lspci"
    # The pattern contains a space, so it must be quoted on the remote side.
    # With nested double quotes the remote command became `grep rev ff`,
    # which searches a file named "ff" and ignores the lspci output.
    remote = (
        "hostname; hostname -i; sudo dmidecode -s system-serial-number; "
        "lspci | grep 'rev ff'"
    )
    cmd = f'for i in $(cat {shlex.quote(hosts)}); do ssh $i "{remote}"; done > {shlex.quote(log_file)}'
    run_cmd(cmd)
121+
122+
123+
# Check for NVRM errors
124+
def nvrm(hosts, path):
    """Record, per host, the dmesg lines that contain NVRM.

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the matching
    ``sudo dmesg`` lines. Everything is written to ``<path>/nvrm``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    report = path + "/nvrm"
    per_host = "hostname; hostname -i; sudo dmidecode -s system-serial-number; sudo dmesg | grep NVRM"
    run_cmd(f'for i in $(cat {hosts}); do ssh $i "{per_host}"; done > {report}')
128+
129+
130+
# Check for Pending remaps
131+
def pending(hosts, path):
    """Record, per host, the nvidia-smi lines reporting "Pending : Yes".

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the matching
    lines of ``nvidia-smi -q``. Everything is written to
    ``<path>/pending_remaps``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    import shlex

    log_file = path + "/" + "pending_remaps"
    # The pattern is single-quoted so it reaches grep as one argument; with
    # nested double quotes grep searched for "Pending" in files named ":"
    # and "Yes". " +" accepts any amount of space padding around the colon.
    remote = (
        "hostname; hostname -i; sudo dmidecode -s system-serial-number; "
        "nvidia-smi -q | grep -E 'Pending +: +Yes'"
    )
    cmd = f'for i in $(cat {shlex.quote(hosts)}); do ssh $i "{remote}"; done > {shlex.quote(log_file)}'
    run_cmd(cmd)
135+
136+
137+
# Check for Remapping failures
138+
def remapping(hosts, path):
    """Record, per host, nvidia-smi lines reporting a remapping failure.

    For every entry in the *hosts* file the host's name, IP address and
    system serial number are collected over ssh, followed by the lines of
    ``nvidia-smi -q`` that read "Remapping Failure Occurred : Yes".
    Everything is written to ``<path>/remapping_failures``.

    Args:
        hosts: Path of a file listing the hosts to query.
        path: Directory that receives the report file.
    """
    import shlex

    log_file = path + "/" + "remapping_failures"
    # The pattern is single-quoted so it reaches grep as one argument; with
    # nested double quotes grep searched for "Remapping" and treated the
    # remaining words as file names. " +" accepts any amount of space
    # padding around the colon.
    remote = (
        "hostname; hostname -i; sudo dmidecode -s system-serial-number; "
        "nvidia-smi -q | grep -E 'Remapping Failure Occurred +: +Yes'"
    )
    cmd = f'for i in $(cat {shlex.quote(hosts)}); do ssh $i "{remote}"; done > {shlex.quote(log_file)}'
    run_cmd(cmd)
142+
143+
144+
def main():
    """Collect H100 troubleshooting data from the hosts named in --hosts.

    Creates a results directory, filters the hosts down to those reachable
    over ssh, runs every collector against them, hands ownership of the
    results to the invoking user and prints where they are.
    """
    parser = argparse.ArgumentParser(description = 'Capture H100 troubleshooting data.')
    parser.add_argument('--hosts', help = "Provide a filepath that contains list of either IPs / hostnames one per line on which you want to run this script.", required = True)
    args = parser.parse_args()
    hosts = args.hosts
    # argparse already enforces that --hosts is given; what still needs
    # checking is that it names a real file before anything is created.
    if not os.path.isfile(hosts):
        print(f"Hostfile {hosts} does not exist or is not a file. Please provide one and run again.")
        sys.exit(-1)

    path = createDir()
    ssh_hosts = getSshableNodes(hosts, path)
    collectors = (
        ibdev,
        eapFailure,
        rdmaAuth,
        linksDown,
        lspci,
        nvrm,
        pending,
        remapping,
    )
    for collect in collectors:
        collect(ssh_hosts, path)
    # Change ownership last so the recursive chown covers every result file,
    # not just the still-empty directory.
    changeOwner(path)
    print("The results are at location: " + path)


if __name__ == "__main__":
    main()
165+

0 commit comments

Comments
 (0)