Skip to content

Commit 9ee6ca0

Browse files
committed
add xid checker to metrics exporter
1 parent 7a51555 commit 9ee6ca0

File tree

2 files changed

+206
-0
lines changed

2 files changed

+206
-0
lines changed
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
from shared_logging import logger
5+
import subprocess
6+
import sys
7+
import re
8+
import os
9+
10+
class XidChecker:
    """Scan the kernel log for GPU Xid errors (``NVRM: Xid`` lines).

    The checker runs a dmesg-like command, counts Xid events per PCI
    address, and reports whether any event of "Critical" severity was seen.
    """

    # Matches e.g. "NVRM: Xid (PCI:0000:3b:00): 79, pid=..." and captures the
    # PCI address (without the closing parenthesis) and the Xid number.
    # Anchoring on "): <digits>," stops a pattern for one Xid from matching
    # text that appears later on a line belonging to a different Xid.
    _XID_LINE_RE = re.compile(r"NVRM: Xid \(PCI:([^)\n]+)\): (\d+),")

    def __init__(self, dmesg_cmd="dmesg", time_interval=60):
        """Create a checker; exits the process with status 1 if not root.

        Args:
            dmesg_cmd: Executable that prints the kernel log. It is run
                without arguments.
            time_interval: Kept for backward compatibility; it is stored on
                the instance but not used by ``check_gpu_xid``.
        """
        # The script refuses to run unless the effective user is root.
        if os.geteuid() != 0:
            logger.error("The XidChecker script did not run since it must be run as root")
            sys.exit(1)
        self.dmesg_cmd = dmesg_cmd
        self.time_interval = time_interval
        self.results = {}

        # Check for the following GPU Xid errors in dmesg
        self.XID_EC = {
            "1": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "2": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "3": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "4": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "5": {"description": "Unused", "severity": "Critical"},
            "6": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "7": {"description": "Invalid or corrupted push buffer address", "severity": "Critical"},
            "8": {"description": "GPU stopped processing", "severity": "Critical"},
            "9": {"description": "Driver error programming GPU", "severity": "Critical"},
            "10": {"description": "Unused", "severity": "Critical"},
            "11": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"},
            "12": {"description": "Driver error handling GPU exception", "severity": "Critical"},
            "13": {"description": "Graphics Engine Exception", "severity": "Critical"},
            "14": {"description": "Unused", "severity": "Warn"},
            "15": {"description": "Unused", "severity": "Warn"},
            "16": {"description": "Display engine hung", "severity": "Warn"},
            "17": {"description": "Unused", "severity": "Warn"},
            "18": {"description": "Bus mastering disabled in PCI Config Space", "severity": "Warn"},
            "19": {"description": "Display Engine error", "severity": "Warn"},
            "20": {"description": "Invalid or corrupted Mpeg push buffer", "severity": "Warn"},
            "21": {"description": "Invalid or corrupted Motion Estimation push buffer", "severity": "Warn"},
            "22": {"description": "Invalid or corrupted Video Processor push buffer", "severity": "Warn"},
            "23": {"description": "Unused", "severity": "Warn"},
            "24": {"description": "GPU semaphore timeout", "severity": "Warn"},
            "25": {"description": "Invalid or illegal push buffer stream", "severity": "Warn"},
            "26": {"description": "Framebuffer timeout", "severity": "Warn"},
            "27": {"description": "Video processor exception", "severity": "Warn"},
            "28": {"description": "Video processor exception", "severity": "Warn"},
            "29": {"description": "Video processor exception", "severity": "Warn"},
            "30": {"description": "GPU semaphore access error", "severity": "Warn"},
            "31": {"description": "GPU memory page fault", "severity": "Critical"},
            "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"},
            "33": {"description": "Internal micro-controller error", "severity": "Warn"},
            "34": {"description": "Video processor exception", "severity": "Warn"},
            "35": {"description": "Video processor exception", "severity": "Warn"},
            "36": {"description": "Video processor exception", "severity": "Warn"},
            "37": {"description": "Driver firmware error", "severity": "Warn"},
            "38": {"description": "Driver firmware error", "severity": "Warn"},
            "39": {"description": "Unused", "severity": "Warn"},
            "40": {"description": "Unused", "severity": "Warn"},
            "41": {"description": "Unused", "severity": "Warn"},
            "42": {"description": "Video processor exception", "severity": "Warn"},
            "43": {"description": "GPU stopped processing", "severity": "Warn"},
            "44": {"description": "Graphics Engine fault during context switch", "severity": "Warn"},
            "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE", "severity": "Warn"},
            "46": {"description": "GPU stopped processing", "severity": "Warn"},
            "47": {"description": "Video processor exception", "severity": "Warn"},
            "48": {"description": "Double Bit ECC Error", "severity": "Critical"},
            "49": {"description": "Unused", "severity": "Warn"},
            "50": {"description": "Unused", "severity": "Warn"},
            "51": {"description": "Unused", "severity": "Warn"},
            "52": {"description": "Unused", "severity": "Warn"},
            "53": {"description": "Unused", "severity": "Warn"},
            "54": {"description": "Auxiliary power is not connected to the GPU board", "severity": "Warn"},
            "55": {"description": "Unused", "severity": "Warn"},
            "56": {"description": "Display Engine error", "severity": "Critical"},
            "57": {"description": "Error programming video memory interface", "severity": "Critical"},
            "58": {"description": "Unstable video memory interface detected", "severity": "Critical"},
            "59": {"description": "Internal micro-controller error (older drivers)", "severity": "Warn"},
            "60": {"description": "Video processor exception", "severity": "Warn"},
            "61": {"description": "Internal micro-controller breakpoint/warning (newer drivers)", "severity": "Warn"},
            "62": {"description": "Internal micro-controller halt", "severity": "Critical"},
            "63": {"description": "ECC page retirement or row remapping recording event", "severity": "Critical"},
            "64": {"description": "ECC page retirement or row remapper recording failure", "severity": "Critical"},
            "65": {"description": "Video processor exception", "severity": "Critical"},
            "66": {"description": "Illegal access by driver", "severity": "Warn"},
            "67": {"description": "Illegal access by driver", "severity": "Warn"},
            "68": {"description": "NVDEC0 Exception", "severity": "Critical"},
            "69": {"description": "Graphics Engine class error", "severity": "Critical"},
            "70": {"description": "CE3: Unknown Error", "severity": "Warn"},
            "71": {"description": "CE4: Unknown Error", "severity": "Warn"},
            "72": {"description": "CE5: Unknown Error", "severity": "Warn"},
            "73": {"description": "NVENC2 Error", "severity": "Critical"},
            "74": {"description": "NVLINK Error", "severity": "Critical"},
            "75": {"description": "CE6: Unknown Error", "severity": "Warn"},
            "76": {"description": "CE7: Unknown Error", "severity": "Warn"},
            "77": {"description": "CE8: Unknown Error", "severity": "Warn"},
            "78": {"description": "vGPU Start Error", "severity": "Warn"},
            "79": {"description": "GPU has fallen off the bus", "severity": "Critical"},
            "80": {"description": "Corrupted data sent to GPU", "severity": "Critical"},
            "81": {"description": "VGA Subsystem Error", "severity": "Critical"},
            # Spelled with a zero (NVJPG0), matching NVJPG1..NVJPG7 below.
            "82": {"description": "NVJPG0 Error", "severity": "Warn"},
            "83": {"description": "NVDEC1 Error", "severity": "Warn"},
            "84": {"description": "NVDEC2 Error", "severity": "Warn"},
            "85": {"description": "CE9: Unknown Error", "severity": "Warn"},
            "86": {"description": "OFA Exception", "severity": "Warn"},
            "87": {"description": "Reserved", "severity": "Warn"},
            "88": {"description": "NVDEC3 Error", "severity": "Warn"},
            "89": {"description": "NVDEC4 Error", "severity": "Warn"},
            "90": {"description": "Reserved", "severity": "Warn"},
            "91": {"description": "Reserved", "severity": "Warn"},
            "92": {"description": "High single-bit ECC error rate", "severity": "Critical"},
            "93": {"description": "Non-fatal violation of provisioned InfoROM wear limit", "severity": "Warn"},
            "94": {"description": "Contained ECC error", "severity": "Critical"},
            "95": {"description": "Uncontained ECC error", "severity": "Critical"},
            "96": {"description": "NVDEC5 Error", "severity": "Warn"},
            "97": {"description": "NVDEC6 Error", "severity": "Warn"},
            "98": {"description": "NVDEC7 Error", "severity": "Warn"},
            "99": {"description": "NVJPG1 Error", "severity": "Warn"},
            "100": {"description": "NVJPG2 Error", "severity": "Warn"},
            "101": {"description": "NVJPG3 Error", "severity": "Warn"},
            "102": {"description": "NVJPG4 Error", "severity": "Warn"},
            "103": {"description": "NVJPG5 Error", "severity": "Warn"},
            "104": {"description": "NVJPG6 Error", "severity": "Warn"},
            "105": {"description": "NVJPG7 Error", "severity": "Warn"},
            "106": {"description": "SMBPBI Test Message", "severity": "Warn"},
            "107": {"description": "SMBPBI Test Message Silent", "severity": "Warn"},
            "108": {"description": "Reserved", "severity": "Warn"},
            "109": {"description": "Context Switch Timeout Error", "severity": "Critical"},
            "110": {"description": "Security Fault Error", "severity": "Warn"},
            "111": {"description": "Display Bundle Error Event", "severity": "Warn"},
            "112": {"description": "Display Supervisor Error", "severity": "Warn"},
            "113": {"description": "DP Link Training Error", "severity": "Warn"},
            "114": {"description": "Display Pipeline Underflow Error", "severity": "Warn"},
            "115": {"description": "Display Core Channel Error", "severity": "Warn"},
            "116": {"description": "Display Window Channel Error", "severity": "Warn"},
            "117": {"description": "Display Cursor Channel Error", "severity": "Warn"},
            "118": {"description": "Display Pixel Pipeline Error", "severity": "Warn"},
            "119": {"description": "GSP RPC Timeout", "severity": "Critical"},
            "120": {"description": "GSP Error", "severity": "Critical"},
            "121": {"description": "C2C Link Error", "severity": "Critical"},
            "122": {"description": "SPI PMU RPC Read Failure", "severity": "Warn"},
            "123": {"description": "SPI PMU RPC Write Failure", "severity": "Warn"},
            "124": {"description": "SPI PMU RPC Erase Failure", "severity": "Warn"},
            "125": {"description": "Inforom FS Failure", "severity": "Warn"},
            "126": {"description": "Reserved", "severity": "Warn"},
            "127": {"description": "Reserved", "severity": "Warn"},
            "128": {"description": "Reserved", "severity": "Warn"},
            "129": {"description": "Reserved", "severity": "Warn"},
            "130": {"description": "Reserved", "severity": "Warn"},
            "131": {"description": "Reserved", "severity": "Warn"},
            "132": {"description": "Reserved", "severity": "Warn"},
            "133": {"description": "Reserved", "severity": "Warn"},
            "134": {"description": "Reserved", "severity": "Warn"},
            "135": {"description": "Reserved", "severity": "Warn"},
            "136": {"description": "Reserved", "severity": "Warn"},
            "137": {"description": "Reserved", "severity": "Warn"},
            "138": {"description": "Reserved", "severity": "Warn"},
            "139": {"description": "Reserved", "severity": "Warn"},
            "140": {"description": "Unrecovered ECC Error", "severity": "Warn"},
            "141": {"description": "Reserved", "severity": "Warn"},
            "142": {"description": "Reserved", "severity": "Warn"},
            "143": {"description": "GPU Initialization Failure", "severity": "Warn"},
        }

    def check_gpu_xid(self):
        """Run the dmesg command and summarise the Xid events it reports.

        Returns:
            dict: ``{"status": "Pass" | "Failed", "results": {...}}``.
            ``results`` maps each Xid number (as a string) found in the log
            and present in ``self.XID_EC`` to
            ``{"results": {pci_address: count}, "description": str}``.
            ``status`` is ``"Failed"`` when at least one such Xid has
            severity ``"Critical"``.

        Raises:
            OSError: If the dmesg command cannot be executed.
            subprocess.CalledProcessError: If it exits with a non-zero status.
        """
        status = "Pass"
        # Start from a clean slate so repeated calls on the same instance do
        # not keep reporting Xids that are no longer in the log.
        self.results = {}

        raw_output = subprocess.check_output([self.dmesg_cmd])
        # Kernel logs may contain stray non-UTF-8 bytes; do not let one bad
        # byte abort the whole check.
        dmesg_output = raw_output.decode("utf-8", errors="replace")

        if "NVRM: Xid" not in dmesg_output:
            logger.info("Xid Check: Passed")
            return {"status": status, "results": self.results}

        # Single pass over the log: xid -> {pci_address: occurrence count}.
        occurrences = {}
        for pci, xid in self._XID_LINE_RE.findall(dmesg_output):
            per_device = occurrences.setdefault(xid, {})
            per_device[pci] = per_device.get(pci, 0) + 1

        # Surface Xids missing from the table instead of dropping them silently.
        for xid in occurrences:
            if xid not in self.XID_EC:
                logger.warning(f"GPU Xid {xid} found in dmesg but it is not in the known Xid table")

        # Walk the table in its own order so logs and results stay ordered by Xid.
        for xid, info in self.XID_EC.items():
            logger.debug(f"Checking for GPU Xid {xid} error in dmesg")
            per_device = occurrences.get(xid)
            if not per_device:
                logger.debug(f"No GPU Xid {xid} error found in dmesg")
                continue
            for pci, count in per_device.items():
                logger.info(f"{xid} : count: {count}, {info['description']} - PCI: {pci}")
            if info["severity"] == "Critical":
                status = "Failed"
            self.results[xid] = {"results": per_device, "description": info["description"]}

        return {"status": status, "results": self.results}
192+
193+
194+
if __name__ == '__main__':
    # Command-line entry point: run one Xid check and log the outcome.
    parser = argparse.ArgumentParser(description='Check for GPU Xid errors.')
    # The value is executed as a command (with no arguments), not read as a file.
    parser.add_argument('--dmesg_cmd', default='dmesg', help='Dmesg command to run. Default is dmesg.')
    args = parser.parse_args()

    logger.debug(f"Using dmesg command: {args.dmesg_cmd}")

    xc = XidChecker(dmesg_cmd=args.dmesg_cmd)
    results = xc.check_gpu_xid()
    logger.debug(f"Status: {results['status']}, Results: {results['results']}")

playbooks/roles/metrics-exporter/tasks/custom_metrics.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- custom_metric_common.py
3838
- shared_logging.py
3939
- rdma_link_flapping.py
40+
- xid_checker.py
4041

4142
- name: Copying custom metric service file
4243
become: true

0 commit comments

Comments
 (0)