|
| 1 | +import os |
| 2 | +from datetime import datetime |
| 3 | +import argparse |
| 4 | +import subprocess |
| 5 | +import sys |
| 6 | + |
| 7 | + |
def getDateTime():
    """Return the current local date and time as an ``MMDDYYYYHHMMSS`` string."""
    timestamp_format = "%m%d%Y%H%M%S"
    return datetime.now().strftime(timestamp_format)
| 13 | + |
| 14 | + |
# create directory to hold results
def createDir():
    """Create a timestamp-named results directory under /tmp.

    The directory name is "/tmp/" followed by the value of getDateTime().
    If os.mkdir fails, the OSError is printed and sys.exit(-1) is called.

    Returns:
        The path of the newly created directory.
    """
    results_dir = "/tmp/" + getDateTime()
    try:
        os.mkdir(results_dir)
    except OSError as mkdir_error:
        print(mkdir_error)
        sys.exit(-1)
    else:
        return results_dir
| 25 | + |
| 26 | + |
# change ownership of all files to user so that the files can be copied
def changeOwner(path):
    """Recursively change ownership of ``path`` to the current user.

    Runs ``sudo chown -R <user>:<user> <path>`` through run_cmd(). The group
    is set to a group with the same name as the user, as in the original
    command.

    Args:
        path: File or directory whose ownership should be changed.
    """
    # Local imports keep this block self-contained.
    import getpass
    import shlex

    try:
        username = os.getlogin()
    except OSError:
        # os.getlogin() requires a controlling terminal and raises OSError
        # without one (cron jobs, some non-interactive sessions); fall back
        # to the environment / password database lookup.
        username = getpass.getuser()
    # Quote both operands so spaces or shell metacharacters cannot break or
    # alter the command line.
    owner = shlex.quote(f'{username}:{username}')
    cmd = f'sudo chown -R {owner} {shlex.quote(str(path))}'
    run_cmd(cmd)
| 32 | + |
| 33 | + |
def getSshableNodes(hosts, path):
    """Probe every host over ssh and record which ones are reachable.

    For each non-blank line of ``hosts`` the first whitespace-separated field
    is taken as the host. The host is appended to ``<path>/sshable`` when the
    ssh probe prints a PRETTY_NAME line from /etc/os-release, otherwise to
    ``<path>/notsshable``.

    Args:
        hosts: Path of a file with one IP / hostname per line.
        path: Directory in which the two result files are written.

    Returns:
        The path of the "sshable" file.
    """
    ssh_list = path + "/" + "sshable"
    not_ssh_list = path + "/" + "notsshable"
    # Context managers close all three files even if a probe raises.
    with open(hosts, "r") as hosts_file, \
            open(ssh_list, "a") as sshable, \
            open(not_ssh_list, "a") as notsshable:
        for line in hosts_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to probe (previously an IndexError).
                continue
            host_value = fields[0]
            cmd = f'ssh -o ConnectTimeout=10 {host_value} "cat /etc/os-release | grep PRETTY_NAME"'
            output = run_cmd(cmd)
            # run_cmd merges stderr into stdout, so ssh warnings may come
            # before the PRETTY_NAME line; look at every line, not only the
            # first one.
            reachable = any('PRETTY_NAME' in out_line for out_line in output)
            target = sshable if reachable else notsshable
            target.write(host_value + "\n")
    return ssh_list
| 58 | + |
| 59 | + |
def run_cmd(cmd=None):
    """Run a command line through /bin/bash and return its output lines.

    stderr is merged into stdout. A non-zero exit status is NOT treated as an
    error: the commands in this script end in ``grep``, which exits 1 when
    nothing matches, so the captured output is returned regardless of the
    exit status.

    Args:
        cmd: Shell command line to run. ``None`` or an empty string runs
            nothing.

    Returns:
        A list of output lines without line terminators. The list is empty
        when there was no output, no command, or the shell could not be
        started.
    """
    if not cmd:
        return []
    try:
        # errors='replace' keeps undecodable bytes in command output from
        # raising UnicodeDecodeError.
        results = subprocess.run(cmd, shell=True, executable='/bin/bash', stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT, encoding='utf8', errors='replace')
    except OSError as e:
        # Raised when the shell itself cannot be executed. The previous
        # CalledProcessError handler could never fire because check=True was
        # not passed to subprocess.run.
        print(f'Command {cmd} failed with error {e}', file=sys.stderr)
        return []
    return results.stdout.splitlines()
| 70 | + |
| 71 | + |
# get interfaces that are Down
def ibdev(hosts, path):
    """Write ibdev2netdev lines containing "Down" for every host to a log.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/ibdev2netdev``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; ibdev2netdev | grep Down"
    destination = f"{path}/ibdev2netdev"
    run_cmd(f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {destination}')
| 77 | + |
| 78 | + |
# get EAP-FAILURE
def eapFailure(hosts, path):
    """Write /var/log/syslog lines containing "EAP-FAILURE" for every host.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/eapfailure``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "eapfailure"
    # The grep pattern is single-quoted: nesting double quotes inside the
    # double-quoted ssh argument closes that argument early and only worked
    # here because the pattern has no spaces.
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; cat /var/log/syslog | grep 'EAP-FAILURE'"
    cmd = f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {log_file}'
    run_cmd(cmd)
| 84 | + |
| 85 | + |
# get rdma links authentication
def rdmaAuth(hosts, path):
    """Append the EAP status of interfaces rdma0..rdma15 for every host.

    For each non-blank line of ``hosts`` the first whitespace-separated field
    is taken as the host. Two ssh commands are run per host: one printing the
    hostname, IP and system serial number, and one printing the "EAP" lines
    of ``wpa_cli -i rdma<x> status`` for x in 0..15. Their output is appended
    to ``<path>/rdmaauth``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "rdmaauth"
    # Context managers close both files even if a command raises.
    with open(hosts, "r") as hosts_file, open(log_file, "a") as rdma_file:
        for line in hosts_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to query (previously an IndexError).
                continue
            host_value = fields[0]
            commands = (
                f'ssh {host_value} "hostname; hostname -i; sudo dmidecode -s system-serial-number"',
                f"ssh {host_value} 'for x in $(seq 0 15) ; do sudo wpa_cli -i rdma$x status | grep EAP ; done'",
            )
            for cmd in commands:
                for out_line in run_cmd(cmd):
                    rdma_file.write(out_line + "\n")
| 107 | + |
| 108 | + |
# get logs for Link Flapping
def linksDown(hosts, path):
    """Write the last 36 /var/log/syslog lines containing "Link " per host.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/linkflapping``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "linkflapping"
    # The pattern is single-quoted: with nested double quotes the local shell
    # split the ssh argument at the space and the remote grep searched for
    # "Link" without the trailing space.
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; cat /var/log/syslog | grep 'Link ' | tail -36"
    cmd = f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {log_file}'
    run_cmd(cmd)
| 114 | + |
| 115 | + |
# Check any GPU fallen off the bus
def lspci(hosts, path):
    """Write lspci lines containing "rev ff" for every host to a log.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/lspci``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "lspci"
    # The pattern is single-quoted: with nested double quotes the remote
    # shell received `grep rev ff`, i.e. it searched a file named "ff"
    # instead of the lspci output.
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; lspci | grep 'rev ff'"
    cmd = f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {log_file}'
    run_cmd(cmd)
| 121 | + |
| 122 | + |
# Check for NVRM errors
def nvrm(hosts, path):
    """Write dmesg lines containing "NVRM" for every host to a log.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/nvrm``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; sudo dmesg | grep NVRM"
    destination = f"{path}/nvrm"
    run_cmd(f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {destination}')
| 128 | + |
| 129 | + |
# Check for Pending remaps
def pending(hosts, path):
    """Write ``nvidia-smi -q`` "Pending : Yes" lines for every host to a log.

    For each host listed in ``hosts`` the remote hostname, IP and system
    serial number are printed first. The combined output of all hosts
    overwrites ``<path>/pending_remaps``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "pending_remaps"
    # The pattern is single-quoted: with nested double quotes the remote
    # shell received `grep Pending : Yes`, which treats ":" and "Yes" as
    # file names and never reads the nvidia-smi output.
    # NOTE(review): " *" tolerates padding between the field name and the
    # colon; nvidia-smi -q appears to column-align its fields -- confirm
    # against real output.
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; nvidia-smi -q | grep -E 'Pending *: *Yes'"
    cmd = f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {log_file}'
    run_cmd(cmd)
| 135 | + |
| 136 | + |
# Check for Remapping failures
def remapping(hosts, path):
    """Write ``nvidia-smi -q`` remapping-failure lines for every host.

    Lines reporting "Remapping Failure Occurred : Yes" are collected. For
    each host listed in ``hosts`` the remote hostname, IP and system serial
    number are printed first. The combined output of all hosts overwrites
    ``<path>/remapping_failures``.

    Args:
        hosts: Path of a file listing the hosts to ssh into.
        path: Directory in which the log file is written.
    """
    log_file = path + "/" + "remapping_failures"
    # The pattern is single-quoted: with nested double quotes the remote
    # shell received `grep Remapping Failure Occurred : Yes`, which treats
    # every word after "Remapping" as a file name.
    # NOTE(review): " *" tolerates padding between the field name and the
    # colon; nvidia-smi -q appears to column-align its fields -- confirm
    # against real output.
    remote = "hostname; hostname -i; sudo dmidecode -s system-serial-number; nvidia-smi -q | grep -E 'Remapping Failure Occurred *: *Yes'"
    cmd = f'for i in $(cat {hosts}); do ssh $i "{remote}"; done > {log_file}'
    run_cmd(cmd)
| 142 | + |
| 143 | + |
if __name__ == "__main__":
    # Entry point: probe the hosts listed in --hosts and collect the
    # troubleshooting logs into a fresh results directory.
    parser = argparse.ArgumentParser(description='Capture H100 troubleshooting data.')
    parser.add_argument('--hosts', help="Provide a filepath that contains list of either IPs / hostnames one per line on which you want to run this script.", required=True)
    args = parser.parse_args()
    hosts = args.hosts
    # argparse already rejects a missing --hosts (required=True), so the old
    # `hosts is None` branch could never run. Validate the file itself before
    # creating the results directory.
    if not os.path.exists(hosts) or os.path.isdir(hosts):
        parser.error(f"Hostfile '{hosts}' does not exist or is not a file. Please provide one and run again.")
    path = createDir()
    changeOwner(path)
    # Only hosts that answered the ssh probe are queried by the collectors.
    ssh_hosts = getSshableNodes(hosts, path)
    ibdev(ssh_hosts, path)
    eapFailure(ssh_hosts, path)
    rdmaAuth(ssh_hosts, path)
    linksDown(ssh_hosts, path)
    lspci(ssh_hosts, path)
    nvrm(ssh_hosts, path)
    pending(ssh_hosts, path)
    remapping(ssh_hosts, path)
    print("The results are at location: " + path)
| 165 | + |
0 commit comments