diff --git a/oci/h100_health_checks/check_h100_setup.py b/oci/h100_health_checks/check_h100_setup.py index 9bcbe37..49c672b 100644 --- a/oci/h100_health_checks/check_h100_setup.py +++ b/oci/h100_health_checks/check_h100_setup.py @@ -4,8 +4,12 @@ import re import argparse from datetime import datetime -from shared_logging import logger +from common_logger import CommonLogger +from common_logger import runWithDummyValues from gpu_bw_test import BandwidthTest +from ecc_test import ECCTest +from gpu_remap_test import GPURemapTest +from rttcc_test import RTTCCTest from rdma_link_flapping import LinkFlappingTest from xid_checker import XidChecker import platform @@ -21,6 +25,8 @@ def get_metadata(): return requests.get(request_url, headers=headers).json() def is_user_root(): + if bool(runWithDummyValues): + return True # Check if the user is root if os.geteuid() != 0: logger.debug("User is root") @@ -75,126 +81,6 @@ def get_oca_version(): # Return the version return version -def check_rttcc_status(): - link_status = [] - devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] - status = "disabled" - status_dict = {"devices": {}} - for device in devices: - if not is_user_root(): - command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] - else: - command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] - result = subprocess.run(command, stdout=subprocess.PIPE) - output = result.stdout.decode('utf-8') - filtered_output = [line for line in output.split('\n') if line.startswith('value')] - for line in filtered_output: - logger.debug(line) - if "0x00000001" in line: - status_dict["devices"][device] = "enabled" - - for device in status_dict["devices"]: - if status_dict["devices"][device] == "enabled": - logger.warning(f"RTTCC enabled on {device}") - status = "enabled" - link_status.append(f"RTTCC enabled on: {device}") - else: - logger.info(f"RTTCC status for {device}: disabled") - if status == "disabled": - logger.info(f"RTTCC disabled check: Passed") - else: - logger.error(f"RTTCC disabled check: Failed") - - return link_status - -def check_ecc_errors(): - ecc_issues = [] - try: - # Run the nvidia-smi -q command - result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE) - except FileNotFoundError: - logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found") - return [] - - # Decode the output from bytes to string - output = result.stdout.decode('utf-8') - - # Find the lines containing "SRAM Correctable" and "DRAM Correctable" - sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) - if len(sram_matches)==0: - sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', output) - dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) - gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) - vol_sram_line = sram_matches[0::2] - vol_dram_line = dram_matches[0::2] - agg_sram_line = sram_matches[1::2] - agg_dram_line = dram_matches[1::2] - - for i, gpu in enumerate(gpu_matches): - logger.debug(f"GPU: {gpu}") - if vol_sram_line[i] != "0": - logger.debug(f"Volatile SRAM Uncorrectable: {vol_sram_line[i]}") - ecc_issues.append(f"{gpu_matches[i]} - Volatile SRAM Uncorrectable: {vol_sram_line[i]}") - if vol_dram_line[i] != "0": - 
logger.debug(f"Volatile DRAM Uncorrectable: {vol_dram_line[i]}") - ecc_issues.append(f"{gpu_matches[i]} - Volatile DRAM Uncorrectable: {vol_dram_line[i]}") - if agg_sram_line[i] != "0": - logger.debug(f"Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") - ecc_issues.append(f"{gpu_matches[i]} - Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") - if agg_dram_line[i] != "0": - logger.debug(f"Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") - ecc_issues.append(f"{gpu_matches[i]} - Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") - - - # Check if there are ecc_issues - if len(ecc_issues) == 0: - logger.info("GPU ECC Test: Passed") - else: - logger.warning("GPU ECC Test: Failed") - - return ecc_issues - -def check_row_remap_errors(): - remap_issues = [] - try: - # Run the nvidia-smi -q command - result = subprocess.run(['nvidia-smi', '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable', '--format=csv,noheader'], stdout=subprocess.PIPE) - - if result.returncode != 0: - logger.debug(f"Check row remap command exited with error code: {result.returncode}") - - except FileNotFoundError: - logger.warning("Skipping Row Remap Test: nvidia-smi command not found") - return [] - - # Decode the output from bytes to string - output = result.stdout.decode('utf-8') - logger.debug("Output: {}".format(output)) - for i, line in enumerate(output.split('\n')): - if line == "": - continue - tmp_data = line.split(",") - tmp_data = [x.strip() for x in tmp_data] - if tmp_data[0] != "0": - logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") - remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") - if tmp_data[1] != "0": - logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") - #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") - if tmp_data[2] != "0": - logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") - if int(tmp_data[2]) > 512: - remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") - else: - remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {tmp_data[2]}")# Check if there are ecc_issues - - if len(remap_issues) == 0: - logger.info("GPU Remap Test: Passed") - else: - logger.warning("GPU Remap Test: Failed") - - return remap_issues - def check_rdma_link_status(): status = True metadata=get_metadata() @@ -260,6 +146,8 @@ def check_rdma_link_status(): return link_issues def get_host_serial(): + if (runWithDummyValues): + return "2349XLG02D" # Run the shell command if not is_user_root(): result = subprocess.run(['sudo', 'dmidecode', '-s', 'system-serial-number'], stdout=subprocess.PIPE) @@ -363,8 +251,19 @@ def slurm_reason(message): parser.add_argument('-slurm','--slurm', dest='slurm', action='store_true', default=False, help='Add a Slurm message') args = parser.parse_args() + logger = CommonLogger.getLogger("h100", None, None) logger.setLevel(args.log_level) + # Summarize the results + try: + host_serial = get_host_serial() + logger.setHostSerial(host_serial) + except Exception as e: + logger.warning(f"Failed to get host serial number with error: {e}") + host_serial = "Unknown" + + logger.info(f"--------- Starting Host setup check for {host_serial} ---------") + datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') logger.info(f"Started GPU host setup check at: {datetime_str}") try: @@ -372,22 +271,28 @@ def slurm_reason(message): except Exception as e: logger.warning(f"Failed to get Oracle Cloud Agent version with error: {e}") oca_version = "Unknown" + + rttc = None try: 
- rttcc_issues = check_rttcc_status() + rttc = RTTCCTest(is_user_root()) + rttcc_issues = rttc.check_rttcc_status() except Exception as e: logger.warning(f"Failed to check RTTCC status with error: {e}") rttcc_issues = [] # Check for ECC errors + ecc = None try: - ecc_issues = check_ecc_errors() + ecc = ECCTest() + ecc_issues = ecc.check_ecc_errors() except Exception as e: logger.warning(f"Failed to check ECC errors with error: {e}") ecc_issues = [] # Check for row remap errors + gpurremap = GPURemapTest() try: - remap_results = check_row_remap_errors() + remap_results = gpurremap.check_row_remap_errors() except Exception as e: logger.warning(f"Failed to check row remap errors with error: {e}") remap_results = [] @@ -409,6 +314,7 @@ def slurm_reason(message): lft_issues = {"failures": [], "link_down": []} # Check for GPU Xid errors + xc = None try: xc = XidChecker() xid_results = xc.check_gpu_xid() @@ -417,9 +323,10 @@ def slurm_reason(message): xid_results = {"status": "None", "results": {}} # Check GPU bandwidth + bwt = None bwt_results = None try: - if args.bw_test == True or args.run_all == True: + if bool(runWithDummyValues) or args.bw_test == True or args.run_all == True: if args.bw_test_exe: bwt = BandwidthTest(bw_test_exe=args.bw_test_exe) else: @@ -454,64 +361,60 @@ def slurm_reason(message): slurm_drain_reason = "" slurm_error_count = 0 + logger.set("h100", host_serial, None) logger.info(f"--------- Summary of Host setup check for {host_serial} ---------") - if oca_version < "1.39.0": + + if oca_version is None or oca_version == "Unknown" or oca_version < "1.39.0": logger.error(f"Oracle Cloud Agent: {oca_version} needs to be updated to 1.39.0 or higher") slurm_reason("OCA version Error") + if len(rttcc_issues) > 0: - logger.error(f"RTTCC issues: {rttcc_issues}") + rttc.logResults(rttcc_issues) slurm_reason("RTTCC Error") + if len(ecc_issues) > 0: - ecc_error=False - for issue in ecc_issues: - if "Skipped" in issue: - logger.warning(f"{host_serial} - {issue}") - else: - if "Aggregate" in issue: - logger.warning(f"{host_serial} - ECC issues: {issue}") - else: - logger.error(f"{host_serial} - ECC issues: {issue}") - ecc_error=True + ecc_error=ecc.logResults(ecc_issues, host_serial) if ecc_error: slurm_reason("ECC Error") + if len(remap_results) > 0: - remap_error=False - for issue in remap_results: - if "<512" in issue: - logger.warning(f"{host_serial} - {issue}") - else: - logger.error(f"{host_serial} - {issue}") - remap_error=True + remap_error=gpurremap.logResults(remap_results) if remap_error: slurm_reason("Remap Error") + if xid_results["status"] == "Failed": + xc.logResults(host_serial, xid_results) for xid in xid_results["results"]: for pci in xid_results["results"][xid]["results"]: - logger.error(f"{host_serial} - GPU Xid {xid} device: {pci}, {xid_results['results'][xid]['description']}") slurm_reason("XID Error") + if len(rdma_link_issues) > 0: for issue in rdma_link_issues: - logger.error(f"{host_serial} - RDMA link issues: {issue}") + logger.error(f"{issue}") slurm_reason("RDMA Link Error") + if len(lft_issues["failures"]) > 0 or len(lft_issues["link_down"]) > 0: + lft.logResults(lft_issues, host_serial); if len(lft_issues["failures"]) > 0: for issue in lft_issues["failures"]: - logger.error(f"{host_serial} - RDMA link flapping issues: {issue}") slurm_reason("RDMA Link Flapping Error") if len(lft_issues["link_down"]) > 0: for issue in lft_issues["link_down"]: - logger.error(f"{host_serial} - RDMA link down issues: {issue}") slurm_reason("RDMA Link Down Error") + if 
bwt_results != None: + bwt.logResults(host_serial, bwt_results) if bwt_results["status"] == "Failed": - for issue in bwt_results["issues"]: - logger.error(f"{host_serial} - GPU bandwidth issues: {issue}") - slurm_reason("GPU Bwt Error") + for device in bwt_results["devices"]: + for issue in bwt_results["devices"][device]: + slurm_reason("GPU Bwt Error") + if bus_results: - logger.error(f"{host_serial} - Bus issues: {bus_results}") + logger.error2("Bus issues", f"{bus_results}") slurm_reason("GPU Bus Error") + if gpu_results: - logger.error(f"{host_serial} - Missing GPU(s): {gpu_results}") + logger.error("Missing GPU(s)", f"{gpu_results}") slurm_reason("Missing GPU Error") datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') diff --git a/oci/h100_health_checks/common_logger.py b/oci/h100_health_checks/common_logger.py new file mode 100644 index 0000000..7d45fb5 --- /dev/null +++ b/oci/h100_health_checks/common_logger.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +import logging + +runWithDummyValues = 0 + +# Common Logger +# Log Pattern: {TIMEDATE}-\s([^\s\-]+)[\s\-]+(\S*)\,(\S*)\,(\S*)[\s\-]+(.*) +# e.g.: 2024-06-14 13:25:47,831 - ERROR - GPU BW,2349XLG02D,dev1 - DtoH: 50.0 is below threshold: 52.0 +# Usage : create instance of this logger +# Use setters to set various values before calling log methods +class CommonLogger: + + def setLevel(self, level): + self.logger.setLevel(level) + + def __init__(self, testname, hostSerial, device): + self.testname = None + self.hostSerial = None + self.device = None + self.logitem_seperator = "," + self.set(testname, hostSerial, device) + logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') + self.logger = logging.getLogger('nhc') + + @classmethod + def getLogger(cls, testname, hostSerial, device): + return CommonLogger(testname, hostSerial, device) + + def reset(self): + self.set(None, None, None) + + def set(self, testname, hostSerial, device): + self.setTestName(testname) + self.setHostSerial(hostSerial) + self.setDevice(device) + + def debug(self, msg, *args, **kwargs): + self.logger.debug(self.getMsg(msg), *args, **kwargs) + + def info(self, msg, *args, **kwargs): + self.logger.info(self.getMsg(msg), *args, **kwargs) + + def warning(self, msg, *args, **kwargs): + self.logger.warning(self.getMsg(msg), *args, **kwargs) + + def error(self, msg, *args, **kwargs): + self.logger.error(self.getMsg(msg), *args, **kwargs) + + def critical(self, msg, *args, **kwargs): + self.logger.critical(self.getMsg(msg), *args, **kwargs) + + def debug2(self, testName, msg, *args, **kwargs): + self.logger.debug(self.getMsg2(testName, msg), *args, **kwargs) + + def info2(self, testName, msg, *args, **kwargs): + self.logger.info(self.getMsg2(testName, msg), *args, **kwargs) + + def warning2(self, testName, msg, *args, **kwargs): + self.logger.warning(self.getMsg2(testName, msg), *args, **kwargs) + + def error2(self, testName, msg, *args, **kwargs): + self.logger.error(self.getMsg2(testName, msg), *args, **kwargs) + + def critical2(self, testName, msg, *args, **kwargs): + self.logger.critical(self.getMsg2(testName, msg), *args, **kwargs) + + def setTestName(self, tn): + if tn is None: + self.testname = "" + else: + self.testname = tn + + def setDevice(self, dv): + if dv is None: + self.device = "" + else: + self.device = dv + + def setHostSerial(self, hs): + if hs is None: + self.hostSerial = "" + else: + self.hostSerial = hs + + def getMsg(self, msg): + return self.getMsg2(self.testname, msg) + + def getMsg2(self, testName, msg): + 
return str(testName) + self.logitem_seperator + str(self.hostSerial) \ + + self.logitem_seperator + str(self.device) + " - " + str(msg) + + + + +#commons logger +#logger = CommonLogger("Main", None, None) + +if __name__ == '__main__': + logger = CommonLogger.getLogger(None, None, None) + logger.setLevel('DEBUG') + logger.critical("message comes here") + logger.error("message comes here") + logger.warning("message comes here") + logger.info("message comes here") + logger.setTestName("Test1") + logger.debug("message comes here") + logger.setHostSerial("serHost1") + logger.critical("message comes here") + logger.setDevice("dev1") + logger.critical2("test3", "message comes here") + logger.set("Test4", "hostSr1", "dev2") + logger.critical2("test3", "message comes here") diff --git a/oci/h100_health_checks/ecc_test.py b/oci/h100_health_checks/ecc_test.py new file mode 100644 index 0000000..69e2931 --- /dev/null +++ b/oci/h100_health_checks/ecc_test.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import re +import subprocess + +from common_logger import CommonLogger, runWithDummyValues + + +class ECCTest: + + def __init__(self): + self.testname = "ECC" + self.logger = CommonLogger.getLogger(self.testname, None, None) + + def check_ecc_errors(self): + ecc_issues = [] + if bool(runWithDummyValues): + return self.getDummyResults() + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE) + except FileNotFoundError: + self.logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Find the lines containing "SRAM Correctable" and "DRAM Correctable" + sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) + if len(sram_matches)==0: + sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', output) + dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) + gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) + vol_sram_line = sram_matches[0::2] + vol_dram_line = dram_matches[0::2] + agg_sram_line = sram_matches[1::2] + agg_dram_line = dram_matches[1::2] + + for i, gpu in enumerate(gpu_matches): + self.logger.debug(f"GPU: {gpu}") + if vol_sram_line[i] != "0": + self.logger.debug(f"Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + if vol_dram_line[i] != "0": + self.logger.debug(f"Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + if agg_sram_line[i] != "0": + self.logger.debug(f"Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + if agg_dram_line[i] != "0": + self.logger.debug(f"Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + + + # Check if there are ecc_issues + if len(ecc_issues) == 0: + self.logger.info("GPU ECC Test: Passed") + else: + self.logger.warning("GPU ECC Test: Failed") + + return ecc_issues + + def logResults(self, ecc_issues, host_serial): + ecc_error=False + if len(ecc_issues) > 0: + self.logger.setHostSerial(host_serial) + for issue in ecc_issues: + if "Skipped" in issue: + self.logger.warning(f"{issue}") + else: + if "Aggregate" in issue: + self.logger.warning(f"{issue}") + else: + self.logger.error(f"{issue}") 
+ ecc_error=True + return ecc_error + + @classmethod + def getDummyResults(cls): + ecc_issues = [] + ecc_issues.append(f"GPU1 - Volatile SRAM Uncorrectable: VOL_LINE1") + ecc_issues.append(f"GPU1 - Aggregate DRAM Uncorrectable: AGG_LINE1") + return ecc_issues + +if __name__ == '__main__': + #runWithDummyValues = 1 + ecc = ECCTest() + ecc_issues = ecc.check_ecc_errors() + ecc.logResults(ecc_issues, "host_serial1") + diff --git a/oci/h100_health_checks/gpu_bw_test.py b/oci/h100_health_checks/gpu_bw_test.py index 369556c..6c3d3c7 100644 --- a/oci/h100_health_checks/gpu_bw_test.py +++ b/oci/h100_health_checks/gpu_bw_test.py @@ -6,12 +6,15 @@ import socket import time import json -from shared_logging import logger +from common_logger import CommonLogger +from common_logger import runWithDummyValues import re class BandwidthTest: def __init__(self, iteration=1, size=32000000, bw_test_exe="/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest"): + self.testname = "GPU BW" + self.logger = CommonLogger.getLogger(self.testname, None, None) self.iteration = iteration self.size = size self.bw_test_exe = bw_test_exe @@ -32,6 +35,10 @@ def get_gpus(self): return len(filtered_output) def measure_gpu_bw(self): + self.logger.set(self.testname, None, None) + if bool(runWithDummyValues): + self.results = BandwidthTest.getDummyResults(); + return numas = 2 gpus = 8 iterations = 1 @@ -41,11 +48,11 @@ def measure_gpu_bw(self): numas = self.get_numa_nodes() gpus_per_numa = gpus // numas - logger.debug("GPUs: {}".format(gpus)) - logger.debug("NUMAs: {}".format(numas)) - logger.debug("GPUs per NUMA: {}".format(gpus_per_numa)) + self.logger.debug("GPUs: {}".format(gpus)) + self.logger.debug("NUMAs: {}".format(numas)) + self.logger.debug("GPUs per NUMA: {}".format(gpus_per_numa)) - logger.debug("Iteration: Device: DtoH : HtoD") + self.logger.debug("Iteration: Device: DtoH : HtoD") hostname = socket.gethostname() results = {"gpus": {}, "host": hostname} @@ -77,12 +84,12 @@ def measure_gpu_bw(self): for i in range(iterations): for device in range(gpus): os.environ["CUDA_VISIBLE_DEVICES"] = str(device) - logger.debug("ENV: {}".format(os.environ["CUDA_VISIBLE_DEVICES"])) - logger.debug("Iteration: {} Device: {} gpus_per_numa: {}".format(i, device, gpus_per_numa)) - logger.debug("CMD: {}".format(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"])) + self.logger.debug("ENV: {}".format(os.environ["CUDA_VISIBLE_DEVICES"])) + self.logger.debug("Iteration: {} Device: {} gpus_per_numa: {}".format(i, device, gpus_per_numa)) + self.logger.debug("CMD: {}".format(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"])) result = subprocess.run(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) - logger.debug("Output: {}".format(result.stdout)) - logger.debug("Error: {}".format(result.stderr)) + self.logger.debug("Output: {}".format(result.stdout)) + self.logger.debug("Error: {}".format(result.stderr)) if result.stdout.find(size) != -1: result = result.stdout.split("\n") tmp = [x for x in result if size in x] @@ -104,40 +111,60 @@ def measure_gpu_bw(self): results["gpus"][device]["dtoh"].append(dtoh) results["gpus"][device]["htod"].append(htod) - logger.debug(str(i) + " : " +str(device) + " : " + str(dtoh) + " : " + str(htod)) + self.logger.debug(str(i) + " : " 
+str(device) + " : " + str(dtoh) + " : " + str(htod)) if i > 1 and i != iterations - 1: # Sleep for 5 seconds and rerun time.sleep(5) - logger.debug(json.dumps(results)) + self.logger.debug(json.dumps(results)) self.results = results def validate_results(self): - gpu_issues = {"status": "Passed", "issues": []} + gpu_issues = {"status": "Passed", "devices": {}} if self.results == None: gpu_issues["issues"].append("GPU bandwidth test did not run since processes are running on the GPU") gpu_issues["status"] = "Failed" return gpu_issues status = True for device in self.results["gpus"]: + gpu_issues["devices"][device] = []; dtoh = self.results["gpus"][device]["dtoh"] htod = self.results["gpus"][device]["htod"] dtoh_avg = sum(dtoh) / len(dtoh) htod_avg = sum(htod) / len(htod) - logger.debug("Device: {} DtoH: {} HtoD: {}".format(device, dtoh_avg, htod_avg)) + self.logger.debug("Device: {} DtoH: {} HtoD: {}".format(device, dtoh_avg, htod_avg)) if dtoh_avg < self.dtoh_threshold: - logger.debug("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) - gpu_issues["issues"].append("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + self.logger.debug("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + gpu_issues["devices"][device].append("DtoH: {} is below threshold: {}".format(dtoh_avg, self.dtoh_threshold)) gpu_issues["status"] = "Failed" if htod_avg < self.htod_threshold: - logger.debug("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) - gpu_issues["issues"].append("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + self.logger.debug("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + gpu_issues["devices"][device].append("HtoD: {} is below threshold: {}".format(htod_avg, self.htod_threshold)) gpu_issues["status"] = "Failed" if gpu_issues["status"] == "Passed": - logger.info("GPU bandwidth test passed") + self.logger.info("GPU bandwidth test passed") return gpu_issues - + + def getTestName(self): + return self.testname + + def logResults(self, hostSerial, gpu_issues): + self.logger.setTestName(self.testname); + self.logger.setHostSerial(hostSerial) + if gpu_issues["status"] == "Failed": + for device in gpu_issues["devices"]: + for issue in gpu_issues["devices"][device]: + self.logger.setDevice(device) + self.logger.error2("GPU BW", f"{issue}") + + @staticmethod + def getDummyResults(): + dummyResults = {"gpus": {}, "host": "hostname"} + dummyResults["gpus"]["dev1"] = {"dtoh": [50], "htod": [44]} + dummyResults["gpus"]["dev2"] = {"dtoh": [52], "htod": [54]} + dummyResults["gpus"]["dev3"] = {"dtoh": [41], "htod": [40]} + return dummyResults if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run GPU bandwidth test') @@ -147,7 +174,9 @@ def validate_results(self): parser.add_argument('--bw-test-exe', dest='bw_test_exe', default='/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest', help='Path to the bw_test executable') args = parser.parse_args() + logger = CommonLogger.getLogger("GPU BW Test", "serial1", None); logger.setLevel(args.log_level) + logger.info("Test Started") if args.iterations != 'NONE': iterations = int(args.iterations) if args.size != 'NONE': @@ -157,11 +186,10 @@ def validate_results(self): bwt = BandwidthTest(iteration=iterations, size=size, bw_test_exe=bw_test_exe) bwt.measure_gpu_bw() + bwt_results = 
bwt.validate_results() - if bwt_results["status"] == "Passed": - logger.info("GPU bandwidth test passed") - else: - logger.error("GPU bandwidth test failed") - for issue in bwt_results["issues"]: - logger.error(issue) + bwt.logResults(None, bwt_results) + + logger.info("Test Ended") + diff --git a/oci/h100_health_checks/gpu_remap_test.py b/oci/h100_health_checks/gpu_remap_test.py new file mode 100644 index 0000000..3306120 --- /dev/null +++ b/oci/h100_health_checks/gpu_remap_test.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +import subprocess + +from common_logger import CommonLogger, runWithDummyValues + + +class GPURemapTest: + + def __init__(self): + self.testname = "GPU REMAP" + self.logger = CommonLogger.getLogger(self.testname, None, None) + + def check_row_remap_errors(self): + remap_issues = [] + if bool(runWithDummyValues): + return self.getDummyResults() + + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable', '--format=csv,noheader'], stdout=subprocess.PIPE) + + if result.returncode != 0: + self.logger.debug(f"Check row remap command exited with error code: {result.returncode}") + + except FileNotFoundError: + self.logger.warning("Skipping Row Remap Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + self.logger.debug("Output: {}".format(output)) + for i, line in enumerate(output.split('\n')): + if line == "": + continue + tmp_data = line.split(",") + tmp_data = [x.strip() for x in tmp_data] + if tmp_data[0] != "0": + self.logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") + remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") + if tmp_data[1] != "0": + self.logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") + #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") + if tmp_data[2] != "0": + self.logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") + if int(tmp_data[2]) > 512: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") + else: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {tmp_data[2]}")# Check if there are ecc_issues + + if len(remap_issues) == 0: + self.logger.info("GPU Remap Test: Passed") + else: + self.logger.warning("GPU Remap Test: Failed") + + return remap_issues + + + def logResults(self, remap_results): + remap_error=False + if len(remap_results) > 0: + for issue in remap_results: + if "<512" in issue: + self.logger.warning(f"{issue}") + else: + self.logger.error(f"{issue}") + remap_error=True + return remap_error + + @classmethod + def getDummyResults(cls): + remap_results = [] + remap_results.append(f"GPU: 1 - Row Remap Uncorrectable <512: 400") + remap_results.append(f"GPU: 2 - Row Remap Uncorrectable >512: 600") + return remap_results + +if __name__ == '__main__': + #runWithDummyValues = 1 + gpuremap = GPURemapTest() + remap_results = gpuremap.check_row_remap_errors() + gpuremap.logResults(remap_results) + diff --git a/oci/h100_health_checks/rdma_link_flapping.py b/oci/h100_health_checks/rdma_link_flapping.py index 2b4b8e8..6044c00 100644 --- a/oci/h100_health_checks/rdma_link_flapping.py +++ b/oci/h100_health_checks/rdma_link_flapping.py @@ -1,14 +1,13 @@ #!/usr/bin/env python3 import os -import sys import time import datetime import re import argparse -import socket import subprocess -from shared_logging import logger + +from common_logger 
import CommonLogger,runWithDummyValues class LinkFlappingTest: @@ -16,8 +15,9 @@ def __init__(self, time_interval=6): self.results = None self.time_interval = int(time_interval) self.link_data = None + self.testname = "LINK FLAP" + self.logger = CommonLogger.getLogger(self.testname, None, None) - # Check if the log file exists msg_file = "/var/log/messages" if not os.path.exists(msg_file): @@ -25,6 +25,8 @@ def __init__(self, time_interval=6): self.log_file = msg_file def get_rdma_link_failures(self): + if bool(runWithDummyValues): + return [] pattern = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+wpa_supplicant(?:\[\d+\])?: (\w+): CTRL-EVENT-EAP-FAILURE EAP authentication failed" pattern2 = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+kernel: (?:\[\d+\.\d+\]\s)?mlx5_core \S+ (\w+): Link down" @@ -36,30 +38,32 @@ def get_rdma_link_failures(self): if match: time_str = match.group(1) interface = match.group(2) - logger.debug(f"time: {time_str}, interface: {interface}") + self.logger.debug(f"time: {time_str}, interface: {interface}") if interface not in self.link_data: self.link_data[interface] = {"failures": [time_str], "link_down": []} else: self.link_data[interface]["failures"].append(time_str) - match = re.search(pattern2, line) if match: time_str = match.group(1) interface = match.group(2) - logger.debug(f"time: {time_str}, interface: {interface}") + self.logger.debug(f"time: {time_str}, interface: {interface}") if interface not in self.link_data: self.link_data[interface] = {"failures": [], "link_down": [time_str]} else: self.link_data[interface]["link_down"].append(time_str) - logger.debug("Link Data: {}".format(self.link_data)) + self.logger.debug("Link Data: {}".format(self.link_data)) return self.link_data def process_rdma_link_flapping(self): link_issues = {"failures": [], "link_down": []} + if bool(runWithDummyValues): + return self.getDummyResults() + # Get the time stamp when the host came up bootup_time = subprocess.run(['uptime', '-s'], stdout=subprocess.PIPE) bootup_time = bootup_time.stdout.decode('utf-8').strip() @@ -77,8 +81,8 @@ def process_rdma_link_flapping(self): for interface in self.link_data: if len(self.link_data[interface]["failures"]) > 0: link_failures = True - logger.debug(f"{interface}: {len(self.link_data[interface]['failures'])} RDMA link failure entries in {self.log_file}") - logger.debug(f"{interface}: {self.link_data[interface]['failures']}") + self.logger.debug(f"{interface}: {len(self.link_data[interface]['failures'])} RDMA link failure entries in {self.log_file}") + self.logger.debug(f"{interface}: {self.link_data[interface]['failures']}") last_date_failure_str = None if len(self.link_data[interface]["failures"]) > 0: @@ -99,18 +103,18 @@ def process_rdma_link_flapping(self): if last_date_failure_str != None and last_date_failure_str != current_date_str: diff_secs = current_date_sec - last_date_failure_sec diff_hours = diff_secs // (60 * 60) - logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") + self.logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") - logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + self.logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") if diff_hours < self.time_interval and last_date_failure_sec > 
bootup_time_grace_period: - logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. Last flapping event: {last_date_failure_str})") + self.logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. Last flapping event: {last_date_failure_str})") link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") status = -1 for interface in self.link_data: if len(self.link_data[interface]["link_down"]) > 0: - logger.debug(f"{interface}: {len(self.link_data[interface]['link_down'])} RDMA link down entries in {self.log_file}") - logger.debug(f"{interface}: {self.link_data[interface]['link_down']}") + self.logger.debug(f"{interface}: {len(self.link_data[interface]['link_down'])} RDMA link down entries in {self.log_file}") + self.logger.debug(f"{interface}: {self.link_data[interface]['link_down']}") last_date_down_str = None if len(self.link_data[interface]["link_down"]) > 0: @@ -132,24 +136,40 @@ def process_rdma_link_flapping(self): if last_date_down_str != None and last_date_down_str != current_date_str: diff_secs = current_date_sec - last_date_down_sec diff_hours = diff_secs // (60 * 60) - logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") + self.logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") - logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + self.logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") if diff_hours < self.time_interval and last_date_down_sec > bootup_time_grace_period: - logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. Last link down event: {last_date_down_str}") + self.logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. 
Last link down event: {last_date_down_str}") link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") status = -2 if status == -1: - logger.debug(f"One or more RDMA link flapping events within the past {self.time_interval} hours") + self.logger.debug(f"One or more RDMA link flapping events within the past {self.time_interval} hours") if status == -2: - logger.debug(f"One or more RDMA link down events within the past {self.time_interval} hours") + self.logger.debug(f"One or more RDMA link down events within the past {self.time_interval} hours") else: - logger.info("No RDMA link failures entry in /var/log/messages") + self.logger.info("No RDMA link failures entry in /var/log/messages") if status == 0: - logger.info("RDMA link flapping/down test: Passed") + self.logger.info("RDMA link flapping/down test: Passed") else: - logger.warning("RDMA link flapping/down test: Failed") + self.logger.warning("RDMA link flapping/down test: Failed") + return link_issues + + def logResults(self, lft_issues, host_serial): + self.logger.setHostSerial(host_serial) + if len(lft_issues["failures"]) > 0 or len(lft_issues["link_down"]) > 0: + if len(lft_issues["failures"]) > 0: + for issue in lft_issues["failures"]: + self.logger.error(f"{issue}") + if len(lft_issues["link_down"]) > 0: + for issue in lft_issues["link_down"]: + self.logger.error(f"RDMA link down issues: {issue}") + + def getDummyResults(self): + link_issues = {"failures": [], "link_down": []} + link_issues["failures"].append(f"eth1: failures reason 1") + link_issues["link_down"].append(f"eth1: link_down reason 1") return link_issues @@ -159,6 +179,7 @@ def process_rdma_link_flapping(self): parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level") args = parser.parse_args() + logger = CommonLogger.getLogger("LINK FLAP", None, None); logger.setLevel(args.log_level) auth_failure_file = "/tmp/last_auth_failure_date" @@ -168,4 +189,5 @@ def process_rdma_link_flapping(self): time_interval_hours = 6 lft = LinkFlappingTest(time_interval=time_interval_hours) link_data = lft.get_rdma_link_failures() - lft.process_rdma_link_flapping() + issues = lft.process_rdma_link_flapping() + lft.logResults(issues, "host_seril100") diff --git a/oci/h100_health_checks/rttcc_test.py b/oci/h100_health_checks/rttcc_test.py new file mode 100644 index 0000000..7637d29 --- /dev/null +++ b/oci/h100_health_checks/rttcc_test.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +import subprocess + +from common_logger import CommonLogger, runWithDummyValues + + +class RTTCCTest: + + def __init__(self, inRoot): + self.testname = "RTTCC" + self.logger = CommonLogger.getLogger(self.testname, None, None) + self.inRoot = inRoot + + def check_rttcc_status(self): + if bool(runWithDummyValues): + return RTTCCTest.getDummyResults() + link_status = {} + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + status = "disabled" + status_dict = {"devices": {}} + for device in devices: + if not self.inRoot: + command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + else: + command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + result = subprocess.run(command, 
stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('value')] + for line in filtered_output: + self.logger.debug(line) + if "0x00000001" in line: + status_dict["devices"][device] = "enabled" + + for device in status_dict["devices"]: + if status_dict["devices"][device] == "enabled": + self.logger.warning(f"RTTCC enabled on {device}") + status = "enabled" + if device not in link_status: + link_status[device] = [] + link_status[device].append(f"RTTCC enabled") + else: + self.logger.info(f"RTTCC status for {device}: disabled") + + self.logger.setDevice(None) + if status == "disabled": + self.logger.info(f"RTTCC disabled check: Passed") + else: + self.logger.error(f"RTTCC disabled check: Failed") + + return link_status + + def logResults(self, rttcc_issues): + if len(rttcc_issues) > 0: + for dev in rttcc_issues: + self.logger.setDevice(dev) + self.logger.error(f"{rttcc_issues[dev]}") + + @classmethod + def getDummyResults(cls): + devices = ["mlx5_0", "mlx5_1", "mlx5_3"] + ret = {} + for dev in devices: + ret[dev] = [] + ret[dev].append("RTTCC enabled") + return ret + +if __name__ == '__main__': + #runWithDummyValues = 1 + rt = RTTCCTest(True) + rttcc_issues = rt.check_rttcc_status() + rt.logResults(rttcc_issues) + diff --git a/oci/h100_health_checks/shared_logging.py b/oci/h100_health_checks/shared_logging.py index af87bc2..4ad8a0a 100644 --- a/oci/h100_health_checks/shared_logging.py +++ b/oci/h100_health_checks/shared_logging.py @@ -3,3 +3,4 @@ import logging logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger('nhc') + diff --git a/oci/h100_health_checks/xid_checker.py b/oci/h100_health_checks/xid_checker.py index 490f7cb..92dcfcd 100644 --- a/oci/h100_health_checks/xid_checker.py +++ b/oci/h100_health_checks/xid_checker.py @@ -1,22 +1,23 @@ #!/usr/bin/env python3 import argparse -from shared_logging import logger +from common_logger import CommonLogger +from common_logger import runWithDummyValues import subprocess -import sys import re -import os + class XidChecker: def __init__(self, dmesg_cmd="dmesg", time_interval=60): + self.testname = "GPU Xid" + self.logger = CommonLogger.getLogger(self.testname, None, None) # if user is root - if not os.geteuid() == 0: - logger.info("The XidChecker script did not run since it must be run as root") - sys.exit(1) + # if not os.geteuid() == 0: + # logger.info("The XidChecker script did not run since it must be run as root") + # sys.exit(1) self.dmesg_cmd = dmesg_cmd - - self.results = {} + self.results = {} # Check for the following GPU Xid errors in dmesg self.XID_EC = { @@ -50,7 +51,7 @@ def __init__(self, dmesg_cmd="dmesg", time_interval=60): "28": {"description": "Video processor exception", "severity": "Warn"}, "29": {"description": "Video processor exception", "severity": "Warn"}, "30": {"description": "GPU semaphore access error", "severity": "Warn"}, - "31": {"description": "GPU memory page fault", "severity": "Critical"}, + "31": {"description": "GPU memory page fault", "severity": "Critical"}, "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"}, "33": {"description": "Internal micro-controller error", "severity": "Warn"}, "34": {"description": "Video processor exception", "severity": "Warn"}, @@ -67,7 +68,7 @@ def __init__(self, dmesg_cmd="dmesg", time_interval=60): "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when 
running multiple cuda applications and hitting a DBE", "severity": "Warn"}, "46": {"description": "GPU stopped processing", "severity": "Warn"}, "47": {"description": "Video processor exception", "severity": "Warn"}, - "48": {"description": "Double Bit ECC Error", "severity": "Critical"}, + "48": {"description": "Double Bit ECC Error", "severity": "Critical"}, "49": {"description": "Unused", "severity": "Warn"}, "50": {"description": "Unused", "severity": "Warn"}, "51": {"description": "Unused", "severity": "Warn"}, @@ -166,13 +167,17 @@ def __init__(self, dmesg_cmd="dmesg", time_interval=60): } def check_gpu_xid(self): + self.logger.set(self.testname, None, None) + if bool(runWithDummyValues): + return XidChecker.getDummyResults() status = "Pass" dmesg_output = subprocess.check_output([self.dmesg_cmd]).decode("utf-8") if "NVRM: Xid" in dmesg_output: for XID in self.XID_EC.keys(): - logger.debug(f"Checking for GPU Xid {XID} error in dmesg") - - matches = re.findall(f"NVRM: Xid \(PCI:(.*?): {XID},", dmesg_output) + self.logger.debug(f"Checking for GPU Xid {XID} error in dmesg") + + matches = re.findall(f"NVRM: Xid \(PCI:(.*?): {XID},", + dmesg_output) tmp_dict = {} for match in matches: if match not in tmp_dict: @@ -180,27 +185,58 @@ def check_gpu_xid(self): else: tmp_dict[match] = tmp_dict[match] + 1 for x in tmp_dict.keys(): - logger.info(f"{XID} : count: {tmp_dict[x]}, {self.XID_EC[XID]['description']} - PCI: {x}") + self.logger.info( + f"{XID} : count: {tmp_dict[x]}, {self.XID_EC[XID]['description']} - PCI: {x}") if not matches: - logger.debug(f"No GPU Xid {XID} error found in dmesg") + self.logger.debug(f"No GPU Xid {XID} error found in dmesg") if tmp_dict != {}: if self.XID_EC[XID]['severity'] == "Critical": status = "Failed" - self.results[XID] = {"results": tmp_dict, "description": self.XID_EC[XID]['description']} + self.results[XID] = {"results": tmp_dict, + "description": self.XID_EC[XID][ + 'description']} else: - logger.info("Xid Check: Passed") + self.logger.info("Xid Check: Passed") return {"status": status, "results": self.results} + def getTestName(self): + return self.testname + + def logResults(self, hostSerial, xid_results): + self.logger.set(self.testname, hostSerial, None) + if xid_results["status"] == "Failed": + self.logger.setTestName("GPU Xid") + for xid in xid_results["results"]: + for pci in xid_results["results"][xid]["results"]: + self.logger.setDevice(pci) + self.logger.error( + f"GPU Xid {xid}, {xid_results['results'][xid]['description']}") + @staticmethod + def getDummyResults(): + dummyResults = {"status": "Failed", "results": {}} + dummyResults["results"]["74"] = {} + dummyResults["results"]["74"]["results"] = {} + dummyResults["results"]["74"]["results"] = {"0000:89:00": 1} + dummyResults["results"]["74"][ + "description"] = "ECC page retirement or row remapping recording event" + return dummyResults + if __name__ == '__main__': # Argument parsing parser = argparse.ArgumentParser(description='Check for GPU Xid errors.') - parser.add_argument('--dmesg_cmd', default='dmesg', help='Dmesg file to check. Default is dmesg.') + parser.add_argument('--dmesg_cmd', default='dmesg', + help='Dmesg file to check. 
Default is dmesg.') args = parser.parse_args() - + logger = CommonLogger.getLogger(None, None, None) + logger.setTestName("GPU Xid") logger.debug(f"Using dmesg command: {args.dmesg_cmd}") - + xc = XidChecker(dmesg_cmd=args.dmesg_cmd) results = xc.check_gpu_xid() - logger.debug("Status: {}, Results: {}".format(results["status"], results["results"])) + + xc.logResults("X001", results) + # logger.error(f"GPU Xid {xid} device: {pci}, {xid_results['results'][xid]['description']}") + + # logger.debug("Status: {}, Results: {}".format(results["status"], results["results"]))
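
For reference, every record emitted through the new CommonLogger follows the layout documented at the top of common_logger.py: `<timestamp> - <LEVEL> - <testname>,<host_serial>,<device> - <message>`, e.g. `2024-06-14 13:25:47,831 - ERROR - GPU BW,2349XLG02D,dev1 - DtoH: 50.0 is below threshold: 52.0`. Below is a minimal sketch of how a downstream collector could split such a record back into fields; the regex and the parse_nhc_line helper are illustrative assumptions, not part of this change.

#!/usr/bin/env python3
# Sketch: split a CommonLogger record back into its fields.
# Assumes the "<testname>,<host_serial>,<device> - <message>" layout built by
# CommonLogger.getMsg2(); parse_nhc_line() is a hypothetical helper.
import re

LINE_RE = re.compile(
    r'^(?P<ts>\S+ \S+) - (?P<level>\w+) - '
    r'(?P<test>[^,]*),(?P<serial>[^,]*),(?P<device>[^,]*?) - (?P<msg>.*)$'
)

def parse_nhc_line(line):
    """Return a dict of fields, or None if the line does not match."""
    m = LINE_RE.match(line.rstrip('\n'))
    return m.groupdict() if m else None

if __name__ == '__main__':
    sample = ("2024-06-14 13:25:47,831 - ERROR - "
              "GPU BW,2349XLG02D,dev1 - DtoH: 50.0 is below threshold: 52.0")
    print(parse_nhc_line(sample))
    # {'ts': '2024-06-14 13:25:47,831', 'level': 'ERROR', 'test': 'GPU BW',
    #  'serial': '2349XLG02D', 'device': 'dev1',
    #  'msg': 'DtoH: 50.0 is below threshold: 52.0'}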
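
The new test classes and check_h100_setup.py gate on the runWithDummyValues flag defined in common_logger.py and return their getDummyResults() fixtures when it is truthy. Because each module binds its own copy via `from common_logger import runWithDummyValues`, the flag has to be set before those modules are imported (or patched on each imported module object afterwards). A minimal dry-run sketch under that assumption, run from the oci/h100_health_checks directory:

# Sketch: exercise the new *Test classes with their built-in dummy results.
# The flag is flipped on common_logger before the test modules are imported,
# so their "from common_logger import runWithDummyValues" picks up the value 1.
import common_logger
common_logger.runWithDummyValues = 1

from ecc_test import ECCTest
from gpu_remap_test import GPURemapTest
from rttcc_test import RTTCCTest

ecc = ECCTest()
ecc_issues = ecc.check_ecc_errors()          # returns ECCTest.getDummyResults()
ecc.logResults(ecc_issues, "2349XLG02D")     # serial reused from get_host_serial()'s dummy value

remap = GPURemapTest()
remap.logResults(remap.check_row_remap_errors())

rttcc = RTTCCTest(inRoot=True)
rttcc.logResults(rttcc.check_rttcc_status())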