Skip to content

Commit c4f0d20

Browse files
Merge pull request #34 from amd/alex_dmesg
Dmesg analyzer update
2 parents: 752247a + 4463430; commit: c4f0d20

File tree

3 files changed

+59
-2
lines changed

3 files changed

+59
-2
lines changed

nodescraper/models/event.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ class Event(BaseModel):
5050
timestamp: datetime.datetime = Field(
5151
default_factory=lambda: datetime.datetime.now(datetime.timezone.utc)
5252
)
53-
reporter: str = "ERROR_SCRAPER"
53+
reporter: str = "NODE_SCRAPER"
5454
category: str
5555
description: str
5656
data: dict = Field(default_factory=dict)

nodescraper/plugins/inband/dmesg/dmesg_analyzer.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
5353
event_category=EventCategory.SW_DRIVER,
5454
),
5555
ErrorRegex(
56-
regex=re.compile(r"[Kk]ernel panic.*"),
56+
regex=re.compile(r"\bkernel panic\b.*", re.IGNORECASE),
5757
message="Kernel Panic",
5858
event_category=EventCategory.SW_DRIVER,
5959
),
@@ -294,6 +294,33 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
294294
event_category=EventCategory.SW_DRIVER,
295295
event_priority=EventPriority.WARNING,
296296
),
297+
ErrorRegex(
298+
regex=re.compile(
299+
r"(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No matching interfaces",
300+
re.IGNORECASE,
301+
),
302+
message="LNet: ko2iblnd has no matching interfaces",
303+
event_category=EventCategory.IO,
304+
event_priority=EventPriority.WARNING,
305+
),
306+
ErrorRegex(
307+
regex=re.compile(
308+
r"(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\s+starting up LNI\s+\w+",
309+
re.IGNORECASE,
310+
),
311+
message="LNet: Error starting up LNI",
312+
event_category=EventCategory.IO,
313+
event_priority=EventPriority.WARNING,
314+
),
315+
ErrorRegex(
316+
regex=re.compile(
317+
r"LustreError:.*ptlrpc_init_portals\(\).*network initiali[sz]ation failed",
318+
re.IGNORECASE,
319+
),
320+
message="Lustre: network initialisation failed",
321+
event_category=EventCategory.IO,
322+
event_priority=EventPriority.WARNING,
323+
),
297324
]
298325

299326
@classmethod

test/unit/plugin/test_dmesg_analyzer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,33 @@ def test_page_fault(system_info):
204204
for event in res.events:
205205
assert event.priority == EventPriority.ERROR
206206
assert event.description == "amdgpu Page Fault"
207+
208+
209+
def test_lnet_and_lustre_boot_errors_are_warning_events(system_info):
210+
dmesg_log = "\n".join(
211+
[
212+
"[ 548.063411] LNetError: 2719:0:(o2iblnd.c:3327:kiblnd_startup()) ko2iblnd: No matching interfaces",
213+
"[ 548.073737] LNetError: 105-4: Error -100 starting up LNI o2ib",
214+
"[Wed Jun 25 17:19:52 2025] LustreError: 2719:0:(events.c:639:ptlrpc_init_portals()) network initialisation failed",
215+
]
216+
)
217+
218+
analyzer = DmesgAnalyzer(
219+
system_info=system_info,
220+
)
221+
data = DmesgData(dmesg_content=dmesg_log)
222+
result = analyzer.analyze_data(data, DmesgAnalyzerArgs())
223+
224+
by_msg = {e.description: e for e in result.events}
225+
226+
m1 = "LNet: ko2iblnd has no matching interfaces"
227+
m2 = "LNet: Error starting up LNI"
228+
m3 = "Lustre: network initialisation failed"
229+
230+
assert m1 in by_msg, f"Missing event: {m1}"
231+
assert m2 in by_msg, f"Missing event: {m2}"
232+
assert m3 in by_msg, f"Missing event: {m3}"
233+
234+
for m in (m1, m2, m3):
235+
ev = by_msg[m]
236+
assert ev.priority == EventPriority.WARNING, f"{m} should be WARNING"

0 commit comments

Comments
 (0)