|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +from shared_logging import logger |
| 5 | +import subprocess |
| 6 | +import sys |
| 7 | +import re |
| 8 | +import os |
| 9 | + |
class XidChecker:
    """Scan kernel log output for NVIDIA GPU Xid errors.

    The checker runs a dmesg-like command, looks for ``NVRM: Xid`` lines and
    counts every known Xid code per PCI device.  Any Xid whose severity in
    ``XID_EC`` is ``"Critical"`` turns the overall status into ``"Failed"``.
    """

    # Matches lines such as "NVRM: Xid (PCI:0000:3b:00): 79, pid=..." and
    # captures the PCI id (without the closing parenthesis) and the Xid code.
    _XID_LINE = re.compile(r"NVRM: Xid \(PCI:([^)]*)\): (\d+),")

    # Known GPU Xid codes: description and severity ("Critical" or "Warn").
    # Kept at class level because the table is constant; it is still
    # reachable as ``self.XID_EC``.
    XID_EC = {
        "1": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "2": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "3": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "4": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "5": {"description": "Unused", "severity": "Critical"},
        "6": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "7": {"description": "Invalid or corrupted push buffer address", "severity": "Critical"},
        "8": {"description": "GPU stopped processing", "severity": "Critical"},
        "9": {"description": "Driver error programming GPU", "severity": "Critical"},
        "10": {"description": "Unused", "severity": "Critical"},
        "11": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
        "12": {"description": "Driver error handling GPU exception", "severity": "Critical"},
        "13": {"description": "Graphics Engine Exception", "severity": "Critical"},
        "14": {"description": "Unused", "severity": "Warn"},
        "15": {"description": "Unused", "severity": "Warn"},
        "16": {"description": "Display engine hung", "severity": "Warn"},
        "17": {"description": "Unused", "severity": "Warn"},
        "18": {"description": "Bus mastering disabled in PCI Config Space", "severity": "Warn"},
        "19": {"description": "Display Engine error", "severity": "Warn"},
        "20": {"description": "Invalid or corrupted Mpeg push buffer", "severity": "Warn"},
        "21": {"description": "Invalid or corrupted Motion Estimation push buffer", "severity": "Warn"},
        "22": {"description": "Invalid or corrupted Video Processor push buffer", "severity": "Warn"},
        "23": {"description": "Unused", "severity": "Warn"},
        "24": {"description": "GPU semaphore timeout", "severity": "Warn"},
        "25": {"description": "Invalid or illegal push buffer stream", "severity": "Warn"},
        "26": {"description": "Framebuffer timeout", "severity": "Warn"},
        "27": {"description": "Video processor exception", "severity": "Warn"},
        "28": {"description": "Video processor exception", "severity": "Warn"},
        "29": {"description": "Video processor exception", "severity": "Warn"},
        "30": {"description": "GPU semaphore access error", "severity": "Warn"},
        "31": {"description": "GPU memory page fault", "severity": "Critical"},
        "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"},
        "33": {"description": "Internal micro-controller error", "severity": "Warn"},
        "34": {"description": "Video processor exception", "severity": "Warn"},
        "35": {"description": "Video processor exception", "severity": "Warn"},
        "36": {"description": "Video processor exception", "severity": "Warn"},
        "37": {"description": "Driver firmware error", "severity": "Warn"},
        "38": {"description": "Driver firmware error", "severity": "Warn"},
        "39": {"description": "Unused", "severity": "Warn"},
        "40": {"description": "Unused", "severity": "Warn"},
        "41": {"description": "Unused", "severity": "Warn"},
        "42": {"description": "Video processor exception", "severity": "Warn"},
        "43": {"description": "GPU stopped processing", "severity": "Warn"},
        "44": {"description": "Graphics Engine fault during context switch", "severity": "Warn"},
        "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE", "severity": "Warn"},
        "46": {"description": "GPU stopped processing", "severity": "Warn"},
        "47": {"description": "Video processor exception", "severity": "Warn"},
        "48": {"description": "Double Bit ECC Error", "severity": "Critical"},
        "49": {"description": "Unused", "severity": "Warn"},
        "50": {"description": "Unused", "severity": "Warn"},
        "51": {"description": "Unused", "severity": "Warn"},
        "52": {"description": "Unused", "severity": "Warn"},
        "53": {"description": "Unused", "severity": "Warn"},
        "54": {"description": "Auxiliary power is not connected to the GPU board", "severity": "Warn"},
        "55": {"description": "Unused", "severity": "Warn"},
        "56": {"description": "Display Engine error", "severity": "Critical"},
        "57": {"description": "Error programming video memory interface", "severity": "Critical"},
        "58": {"description": "Unstable video memory interface detected", "severity": "Critical"},
        "59": {"description": "Internal micro-controller error (older drivers)", "severity": "Warn"},
        "60": {"description": "Video processor exception", "severity": "Warn"},
        "61": {"description": "Internal micro-controller breakpoint/warning (newer drivers)", "severity": "Warn"},
        "62": {"description": "Internal micro-controller halt", "severity": "Critical"},
        "63": {"description": "ECC page retirement or row remapping recording event", "severity": "Critical"},
        "64": {"description": "ECC page retirement or row remapper recording failure", "severity": "Critical"},
        "65": {"description": "Video processor exception", "severity": "Critical"},
        "66": {"description": "Illegal access by driver", "severity": "Warn"},
        "67": {"description": "Illegal access by driver", "severity": "Warn"},
        "68": {"description": "NVDEC0 Exception", "severity": "Critical"},
        "69": {"description": "Graphics Engine class error", "severity": "Critical"},
        "70": {"description": "CE3: Unknown Error", "severity": "Warn"},
        "71": {"description": "CE4: Unknown Error", "severity": "Warn"},
        "72": {"description": "CE5: Unknown Error", "severity": "Warn"},
        "73": {"description": "NVENC2 Error", "severity": "Critical"},
        "74": {"description": "NVLINK Error", "severity": "Critical"},
        "75": {"description": "CE6: Unknown Error", "severity": "Warn"},
        "76": {"description": "CE7: Unknown Error", "severity": "Warn"},
        "77": {"description": "CE8: Unknown Error", "severity": "Warn"},
        "78": {"description": "vGPU Start Error", "severity": "Warn"},
        "79": {"description": "GPU has fallen off the bus", "severity": "Critical"},
        "80": {"description": "Corrupted data sent to GPU", "severity": "Critical"},
        "81": {"description": "VGA Subsystem Error", "severity": "Critical"},
        "82": {"description": "NVJPG0 Error", "severity": "Warn"},
        "83": {"description": "NVDEC1 Error", "severity": "Warn"},
        "84": {"description": "NVDEC2 Error", "severity": "Warn"},
        "85": {"description": "CE9: Unknown Error", "severity": "Warn"},
        "86": {"description": "OFA Exception", "severity": "Warn"},
        "87": {"description": "Reserved", "severity": "Warn"},
        "88": {"description": "NVDEC3 Error", "severity": "Warn"},
        "89": {"description": "NVDEC4 Error", "severity": "Warn"},
        "90": {"description": "Reserved", "severity": "Warn"},
        "91": {"description": "Reserved", "severity": "Warn"},
        "92": {"description": "High single-bit ECC error rate", "severity": "Critical"},
        "93": {"description": "Non-fatal violation of provisioned InfoROM wear limit", "severity": "Warn"},
        "94": {"description": "Contained ECC error", "severity": "Critical"},
        "95": {"description": "Uncontained ECC error", "severity": "Critical"},
        "96": {"description": "NVDEC5 Error", "severity": "Warn"},
        "97": {"description": "NVDEC6 Error", "severity": "Warn"},
        "98": {"description": "NVDEC7 Error", "severity": "Warn"},
        "99": {"description": "NVJPG1 Error", "severity": "Warn"},
        "100": {"description": "NVJPG2 Error", "severity": "Warn"},
        "101": {"description": "NVJPG3 Error", "severity": "Warn"},
        "102": {"description": "NVJPG4 Error", "severity": "Warn"},
        "103": {"description": "NVJPG5 Error", "severity": "Warn"},
        "104": {"description": "NVJPG6 Error", "severity": "Warn"},
        "105": {"description": "NVJPG7 Error", "severity": "Warn"},
        "106": {"description": "SMBPBI Test Message", "severity": "Warn"},
        "107": {"description": "SMBPBI Test Message Silent", "severity": "Warn"},
        "108": {"description": "Reserved", "severity": "Warn"},
        "109": {"description": "Context Switch Timeout Error", "severity": "Critical"},
        "110": {"description": "Security Fault Error", "severity": "Warn"},
        "111": {"description": "Display Bundle Error Event", "severity": "Warn"},
        "112": {"description": "Display Supervisor Error", "severity": "Warn"},
        "113": {"description": "DP Link Training Error", "severity": "Warn"},
        "114": {"description": "Display Pipeline Underflow Error", "severity": "Warn"},
        "115": {"description": "Display Core Channel Error", "severity": "Warn"},
        "116": {"description": "Display Window Channel Error", "severity": "Warn"},
        "117": {"description": "Display Cursor Channel Error", "severity": "Warn"},
        "118": {"description": "Display Pixel Pipeline Error", "severity": "Warn"},
        "119": {"description": "GSP RPC Timeout", "severity": "Critical"},
        "120": {"description": "GSP Error", "severity": "Critical"},
        "121": {"description": "C2C Link Error", "severity": "Critical"},
        "122": {"description": "SPI PMU RPC Read Failure", "severity": "Warn"},
        "123": {"description": "SPI PMU RPC Write Failure", "severity": "Warn"},
        "124": {"description": "SPI PMU RPC Erase Failure", "severity": "Warn"},
        "125": {"description": "Inforom FS Failure", "severity": "Warn"},
        "126": {"description": "Reserved", "severity": "Warn"},
        "127": {"description": "Reserved", "severity": "Warn"},
        "128": {"description": "Reserved", "severity": "Warn"},
        "129": {"description": "Reserved", "severity": "Warn"},
        "130": {"description": "Reserved", "severity": "Warn"},
        "131": {"description": "Reserved", "severity": "Warn"},
        "132": {"description": "Reserved", "severity": "Warn"},
        "133": {"description": "Reserved", "severity": "Warn"},
        "134": {"description": "Reserved", "severity": "Warn"},
        "135": {"description": "Reserved", "severity": "Warn"},
        "136": {"description": "Reserved", "severity": "Warn"},
        "137": {"description": "Reserved", "severity": "Warn"},
        "138": {"description": "Reserved", "severity": "Warn"},
        "139": {"description": "Reserved", "severity": "Warn"},
        "140": {"description": "Unrecovered ECC Error", "severity": "Warn"},
        "141": {"description": "Reserved", "severity": "Warn"},
        "142": {"description": "Reserved", "severity": "Warn"},
        "143": {"description": "GPU Initialization Failure", "severity": "Warn"},
    }

    def __init__(self, dmesg_cmd="dmesg", time_interval=60):
        """Create a checker; exits the process with status 1 when not root.

        Args:
            dmesg_cmd: Executable (name or path) that is run without
                arguments and whose standard output is scanned.
            time_interval: Currently unused; accepted so existing callers
                that pass it keep working.
        """
        # The check refuses to run for any effective user other than root.
        if os.geteuid() != 0:
            logger.info("The XidChecker script did not run since it must be run as root")
            sys.exit(1)
        self.dmesg_cmd = dmesg_cmd
        self.results = {}

    @classmethod
    def _count_xids(cls, dmesg_output):
        """Return ``{xid: {pci_id: occurrences}}`` from one pass over the text."""
        counts = {}
        for hit in cls._XID_LINE.finditer(dmesg_output):
            pci_id, xid = hit.groups()
            per_pci = counts.setdefault(xid, {})
            per_pci[pci_id] = per_pci.get(pci_id, 0) + 1
        return counts

    def check_gpu_xid(self):
        """Run the dmesg command and report the known Xid errors it contains.

        Returns:
            A dict ``{"status": ..., "results": ...}``.  ``status`` is
            ``"Failed"`` when at least one Xid with ``"Critical"`` severity was
            found and ``"Pass"`` otherwise.  ``results`` maps each Xid code
            found to ``{"results": {pci_id: count}, "description": text}``;
            the same dict is also stored in ``self.results``.

        Raises:
            subprocess.CalledProcessError: The command exited non-zero.
            OSError: The command could not be started.
        """
        status = "Pass"
        # Start from a fresh dict so a repeated call never reports Xids that
        # were only present in an earlier run.
        self.results = {}

        raw_output = subprocess.check_output([self.dmesg_cmd])
        # Kernel logs may contain stray non-UTF-8 bytes; do not crash on them.
        dmesg_output = raw_output.decode("utf-8", errors="replace")

        if "NVRM: Xid" not in dmesg_output:
            logger.info("Xid Check: Passed")
            return {"status": status, "results": self.results}

        found = self._count_xids(dmesg_output)
        # Walk the table (not the matches) so reporting order follows XID_EC
        # and codes missing from the table are ignored, as before.
        for xid, info in self.XID_EC.items():
            logger.debug(f"Checking for GPU Xid {xid} error in dmesg")
            per_pci = found.get(xid)
            if not per_pci:
                logger.debug(f"No GPU Xid {xid} error found in dmesg")
                continue
            for pci_id, count in per_pci.items():
                logger.info(f"{xid} : count: {count}, {info['description']} - PCI: {pci_id}")
            if info["severity"] == "Critical":
                status = "Failed"
            self.results[xid] = {"results": per_pci, "description": info["description"]}
        return {"status": status, "results": self.results}
| 192 | + |
| 193 | + |
def main():
    """Parse the command line, run the Xid check and log its outcome."""
    parser = argparse.ArgumentParser(description='Check for GPU Xid errors.')
    # The value is executed as a command, not opened as a file, so the help
    # text describes it as such.
    parser.add_argument(
        '--dmesg_cmd',
        default='dmesg',
        help='Command to run to obtain the kernel log. Default is dmesg.',
    )
    args = parser.parse_args()

    logger.debug(f"Using dmesg command: {args.dmesg_cmd}")

    checker = XidChecker(dmesg_cmd=args.dmesg_cmd)
    outcome = checker.check_gpu_xid()
    logger.debug(f"Status: {outcome['status']}, Results: {outcome['results']}")


if __name__ == '__main__':
    main()
0 commit comments