Skip to content

Commit d006058

Browse files
committed
update page fault regex
1 parent 1827983 commit d006058

File tree

2 files changed

+65
-10
lines changed

2 files changed

+65
-10
lines changed

nodescraper/plugins/inband/dmesg/dmesg_analyzer.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -129,16 +129,17 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
129129
ErrorRegex(
130130
regex=re.compile(
131131
(
132-
r"(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:\s+\[\S+\]\s*(?:retry|no-retry)? page fault.*)"
133-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
134-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
135-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
136-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
137-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
138-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
139-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
140-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
141-
r"(?:\n.*(amdgpu \d{4}:\d{2}:\d{2}.\d:\s+amdgpu:.*))?"
132+
r"(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S+\]\s*(?:retry|no-retry)? page fault.*)"
133+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
134+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
135+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
136+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
137+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
138+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
139+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
140+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
141+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
142+
r"(?:\n.*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:.*))?"
142143
)
143144
),
144145
message="amdgpu Page Fault",

test/unit/plugin/test_dmesg_analyzer.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
###############################################################################
2626
import datetime
2727

28+
from nodescraper.enums.eventpriority import EventPriority
2829
from nodescraper.enums.executionstatus import ExecutionStatus
2930
from nodescraper.plugins.inband.dmesg.analyzer_args import DmesgAnalyzerArgs
3031
from nodescraper.plugins.inband.dmesg.dmesg_analyzer import DmesgAnalyzer
@@ -150,3 +151,56 @@ def test_exclude_category(system_info):
150151
assert len(res.events) == 4
151152
for event in res.events:
152153
assert event.category != "RAS"
154+
155+
156+
def test_page_fault(system_info):
    """Check that amdgpu page fault entries in dmesg are reported as error events.

    The sample log contains four page fault entries: three that are followed by
    further amdgpu lines from the same device (two on ``0000:03:00.0`` and one
    on ``0000:f5:00.0``) and one standalone entry on ``0003:02:00.0``. The
    analyzer is expected to finish with an ERROR status and to emit exactly
    four events, each with ERROR priority and the "amdgpu Page Fault"
    description.
    """
    log_lines = (
        "kern :err : 2025-01-01T00:00:00,000000+00:00 amdgpu 0000:03:00.0: amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)",
        "kern :err : 2025-01-01T00:00:01,000000+00:00 amdgpu 0000:03:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:02,000000+00:00 amdgpu 0000:03:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:03,000000+00:00 amdgpu 0000:03:00.0: amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00000000",
        "kern :err : 2025-01-01T00:00:04,000000+00:00 amdgpu 0000:03:00.0: amdgpu: Faulty UTCL2 client ID: ABC123 (0x000)",
        "kern :err : 2025-01-01T00:00:05,000000+00:00 amdgpu 0000:03:00.0: amdgpu: MORE_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:06,000000+00:00 amdgpu 0000:03:00.0: amdgpu: WALKER_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:07,000000+00:00 amdgpu 0000:03:00.0: amdgpu: PERMISSION_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:08,000000+00:00 amdgpu 0000:03:00.0: amdgpu: MAPPING_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:09,000000+00:00 amdgpu 0000:03:00.0: amdgpu: RW: 0x0",
        "kern :info : 2025-01-01T00:00:10,000000+00:00 TEST TEST",
        "kern :err : 2025-01-01T00:00:11,000000+00:00 amdgpu 0000:03:00.0: amdgpu: [gfxhub0] retry page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)",
        "kern :err : 2025-01-01T00:00:12,000000+00:00 amdgpu 0000:03:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:13,000000+00:00 amdgpu 0000:03:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:14,000000+00:00 amdgpu 0000:03:00.0: amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00000000",
        "kern :err : 2025-01-01T00:00:15,000000+00:00 amdgpu 0000:03:00.0: amdgpu: Faulty UTCL2 client ID: ABC123 (0x000)",
        "kern :err : 2025-01-01T00:00:16,000000+00:00 amdgpu 0000:03:00.0: amdgpu: MORE_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:17,000000+00:00 amdgpu 0000:03:00.0: amdgpu: WALKER_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:18,000000+00:00 amdgpu 0000:03:00.0: amdgpu: PERMISSION_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:19,000000+00:00 amdgpu 0000:03:00.0: amdgpu: MAPPING_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:20,000000+00:00 amdgpu 0000:03:00.0: amdgpu: RW: 0x0",
        "kern :info : 2025-01-01T00:00:21,000000+00:00 TEST TEST",
        "kern :err : 2025-01-01T00:00:22,000000+00:00 amdgpu 0003:02:00.0: amdgpu: [gfxhub0] retry page fault (swpekfwpo",
        "kern :info : 2025-01-01T00:00:23,000000+00:00 TEST TEST",
        "kern :err : 2025-01-01T00:00:24,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)",
        "kern :err : 2025-01-01T00:00:25,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:26,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:27,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: test example 123",
        "kern :err : 2025-01-01T00:00:28,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00000000",
        "kern :err : 2025-01-01T00:00:29,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: Faulty UTCL2 client ID: ABC123 (0x000)",
        "kern :err : 2025-01-01T00:00:30,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: MORE_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:31,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: WALKER_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:32,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: PERMISSION_FAULTS: 0x0",
        "kern :err : 2025-01-01T00:00:33,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: MAPPING_ERROR: 0x0",
        "kern :err : 2025-01-01T00:00:34,000000+00:00 amdgpu 0000:f5:00.0: amdgpu: RW: 0x0",
    )
    # Every log line is newline-terminated, including the final one.
    sample = DmesgData(dmesg_content="".join(f"{line}\n" for line in log_lines))

    result = DmesgAnalyzer(system_info=system_info).analyze_data(sample)

    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 4
    for reported in result.events:
        assert reported.priority == EventPriority.ERROR
        assert reported.description == "amdgpu Page Fault"

0 commit comments

Comments
 (0)