Skip to content

Commit 78bcb1a

Browse files
committed
Merge branch 'development' into alex_mem_update
2 parents d02aed1 + 3df4921 commit 78bcb1a

File tree

2 files changed

+118
-22
lines changed

2 files changed

+118
-22
lines changed

nodescraper/plugins/inband/dmesg/dmesg_analyzer.py

Lines changed: 73 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -98,9 +98,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
9898
event_category=EventCategory.SW_DRIVER,
9999
),
100100
ErrorRegex(
101-
regex=re.compile(
102-
r"(?:[\w\d_-]*)(?:\[[\d.]*\])? (?:general protection fault)|(?:general protection fault.*)"
103-
),
101+
regex=re.compile(r"(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protection fault[^\n]*"),
104102
message="General protection fault",
105103
event_category=EventCategory.SW_DRIVER,
106104
),
@@ -129,18 +127,19 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
129127
ErrorRegex(
130128
regex=re.compile(
131129
(
132-
r"(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S+\]\s*(?:retry|no-retry)? page fault.*)"
133-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
134-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
135-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
136-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
137-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
138-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
139-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
140-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
141-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
142-
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
143-
)
130+
r"(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S+\]\s*(?:retry|no-retry)? page fault[^\n]*)"
131+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
132+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
133+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
134+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
135+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
136+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
137+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
138+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
139+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
140+
r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
141+
),
142+
re.MULTILINE,
144143
),
145144
message="amdgpu Page Fault",
146145
event_category=EventCategory.SW_DRIVER,
@@ -259,7 +258,52 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
259258
event_category=EventCategory.RAS,
260259
),
261260
ErrorRegex(
262-
regex=re.compile(r"Accelerator Check Architecture.*(?:\n.*){0,5}"),
261+
regex=re.compile(
262+
(
263+
r"(Accelerator Check Architecture[^\n]*)"
264+
r"(?:\n[^\n]*){0,10}?"
265+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.STATUS=0x[0-9a-fA-F]+)"
266+
r"(?:\n[^\n]*){0,5}?"
267+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.ADDR=0x[0-9a-fA-F]+)"
268+
r"(?:\n[^\n]*){0,5}?"
269+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.MISC0=0x[0-9a-fA-F]+)"
270+
r"(?:\n[^\n]*){0,5}?"
271+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.IPID=0x[0-9a-fA-F]+)"
272+
r"(?:\n[^\n]*){0,5}?"
273+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.SYND=0x[0-9a-fA-F]+-?)"
274+
),
275+
re.MULTILINE,
276+
),
277+
message="ACA Error",
278+
event_category=EventCategory.RAS,
279+
),
280+
ErrorRegex(
281+
regex=re.compile(
282+
(
283+
r"(Accelerator Check Architecture[^\n]*)"
284+
r"(?:\n[^\n]*){0,10}?"
285+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONTROL=0x[0-9a-fA-F]+)"
286+
r"(?:\n[^\n]*){0,5}?"
287+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*STATUS=0x[0-9a-fA-F]+)"
288+
r"(?:\n[^\n]*){0,5}?"
289+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*ADDR=0x[0-9a-fA-F]+)"
290+
r"(?:\n[^\n]*){0,5}?"
291+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*MISC=0x[0-9a-fA-F]+)"
292+
r"(?:\n[^\n]*){0,5}?"
293+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONFIG=0x[0-9a-fA-F]+)"
294+
r"(?:\n[^\n]*){0,5}?"
295+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*IPID=0x[0-9a-fA-F]+)"
296+
r"(?:\n[^\n]*){0,5}?"
297+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*SYND=0x[0-9a-fA-F]+)"
298+
r"(?:\n[^\n]*){0,5}?"
299+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*DESTAT=0x[0-9a-fA-F]+)"
300+
r"(?:\n[^\n]*){0,5}?"
301+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*DEADDR=0x[0-9a-fA-F]+)"
302+
r"(?:\n[^\n]*){0,5}?"
303+
r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONTROL_MASK=0x[0-9a-fA-F]+)"
304+
),
305+
re.MULTILINE,
306+
),
263307
message="ACA Error",
264308
event_category=EventCategory.RAS,
265309
),
@@ -369,18 +413,25 @@ def _is_known_error(self, known_err_events: list[Event], unknown_match: str) ->
369413
bool: return True if error is known
370414
"""
371415
for regex_obj in self.ERROR_REGEX:
372-
if regex_obj.regex.search(unknown_match):
373-
return True
416+
try:
417+
if regex_obj.regex.search(unknown_match):
418+
return True
419+
except re.error:
420+
continue
374421

375-
# handle multline matches
376422
for event in known_err_events:
377423
known_match = event.data["match_content"]
378424
if isinstance(known_match, list):
379425
for line in known_match:
380-
if unknown_match in line:
426+
if unknown_match == line or unknown_match in line or line in unknown_match:
381427
return True
382-
elif known_match in unknown_match:
383-
return True
428+
elif isinstance(known_match, str):
429+
if (
430+
unknown_match == known_match
431+
or unknown_match in known_match
432+
or known_match in unknown_match
433+
):
434+
return True
384435
return False
385436

386437
def analyze_data(

test/unit/plugin/test_dmesg_analyzer.py

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -234,3 +234,48 @@ def test_lnet_and_lustre_boot_errors_are_warning_events(system_info):
234234
for m in (m1, m2, m3):
235235
ev = by_msg[m]
236236
assert ev.priority == EventPriority.WARNING, f"{m} should be WARNING"
237+
238+
239+
def test_aca(system_info):
240+
aca_data1 = DmesgData(
241+
dmesg_content=(
242+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] Accelerator Check Architecture events logged\n"
243+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] aca entry[00].STATUS=0x000000000000000f\n"
244+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] aca entry[00].ADDR=0x0000000000000000\n"
245+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] aca entry[00].MISC0=0x0000000000000000\n"
246+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] aca entry[00].IPID=0x0000000000000000\n"
247+
"kern :err : 2025-01-01T10:17:15,145363-04:00 amdgpu 0000:0c:00.0: amdgpu: [Hardware error] aca entry[00].SYND=0x0000000000000000\n"
248+
)
249+
)
250+
251+
aca_data2 = DmesgData(
252+
dmesg_content=(
253+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: Accelerator Check Architecture events logged\n"
254+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].CONTROL=0x000000000000000f\n"
255+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].STATUS=0x0000000000000000\n"
256+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].ADDR=0x0000000000000000\n"
257+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].MISC=0x0000000000000000\n"
258+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].CONFIG=0x0000000000000000\n"
259+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].IPID=0x0000000000000000\n"
260+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].SYND=0x0000000000000000\n"
261+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].DESTAT=0x0000000000000000\n"
262+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].DEADDR=0x0000000000000000\n"
263+
"kern :err : 2025-01-01T17:53:23,028841-06:00 amdgpu 0000:48:00.0: {1}[Hardware Error]: ACA[01/01].CONTROL_MASK=0x0000000000000000\n"
264+
)
265+
)
266+
267+
analyzer = DmesgAnalyzer(
268+
system_info=system_info,
269+
)
270+
271+
res = analyzer.analyze_data(aca_data1)
272+
assert res.status == ExecutionStatus.ERROR
273+
assert len(res.events) == 1
274+
assert res.events[0].description == "ACA Error"
275+
assert res.events[0].priority == EventPriority.ERROR
276+
277+
res = analyzer.analyze_data(aca_data2)
278+
assert res.status == ExecutionStatus.ERROR
279+
assert len(res.events) == 1
280+
assert res.events[0].description == "ACA Error"
281+
assert res.events[0].priority == EventPriority.ERROR

0 commit comments

Comments
 (0)