Commit cbee075

Merge branch 'development' into alex_devenum_update
2 parents 099b58d + 31fee81 commit cbee075

File tree

16 files changed: +5018 additions, -50 deletions

nodescraper/models/taskresult.py

Lines changed: 27 additions & 15 deletions

@@ -103,28 +103,40 @@ def duration(self) -> Optional[str]:
         return duration
 
     def _get_event_summary(self) -> str:
-        """Get summary string for artifacts
+        """Get summary string for events
 
         Returns:
-            str: artifact summary
+            str: event summary with counts and descriptions
         """
-        error_count = 0
-        warning_count = 0
+        error_msg_counts: dict[str, int] = {}
+        warning_msg_counts: dict[str, int] = {}
 
         for event in self.events:
             if event.priority == EventPriority.WARNING:
-                warning_count += 1
+                warning_msg_counts[event.description] = (
+                    warning_msg_counts.get(event.description, 0) + 1
+                )
             elif event.priority >= EventPriority.ERROR:
-                error_count += 1
-
-        summary_list = []
-
-        if warning_count:
-            summary_list.append(f"{warning_count} warnings")
-        if error_count:
-            summary_list.append(f"{error_count} errors")
-
-        return "|".join(summary_list)
+                error_msg_counts[event.description] = error_msg_counts.get(event.description, 0) + 1
+
+        summary_parts = []
+
+        if warning_msg_counts:
+            total_warnings = sum(warning_msg_counts.values())
+            warning_details = [
+                f"{msg} (x{count})" if count > 1 else msg
+                for msg, count in warning_msg_counts.items()
+            ]
+            summary_parts.append(f"{total_warnings} warnings: {', '.join(warning_details)}")
+
+        if error_msg_counts:
+            total_errors = sum(error_msg_counts.values())
+            error_details = [
+                f"{msg} (x{count})" if count > 1 else msg for msg, count in error_msg_counts.items()
+            ]
+            summary_parts.append(f"{total_errors} errors: {', '.join(error_details)}")
+
+        return "; ".join(summary_parts)
 
     def _update_status(self) -> None:
        """Update overall status based on event priority"""

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 236 additions & 3 deletions

@@ -30,7 +30,15 @@
 from nodescraper.interfaces import DataAnalyzer
 from nodescraper.models import TaskResult
 
-from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes
+from .amdsmidata import (
+    AmdSmiDataModel,
+    AmdSmiMetric,
+    AmdSmiStatic,
+    EccData,
+    Fw,
+    Partition,
+    Processes,
+)
 from .analyzer_args import AmdSmiAnalyzerArgs
 
 
@@ -122,6 +130,223 @@ def check_expected_driver_version(
             },
         )
 
+    def check_amdsmi_metric_pcie(
+        self,
+        amdsmi_metric_data: list[AmdSmiMetric],
+        l0_to_recovery_count_error_threshold: int,
+        l0_to_recovery_count_warning_threshold: int,
+    ):
+        """Check PCIe metrics for link errors
+
+        Checks for PCIe link width, speed, replays, recoveries, and NAKs.
+        Expected width/speeds should come from SKU info.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+            l0_to_recovery_count_error_threshold (int): Threshold for error events
+            l0_to_recovery_count_warning_threshold (int): Threshold for warning events
+        """
+        for metric in amdsmi_metric_data:
+            pcie_data = metric.pcie
+            gpu = metric.gpu
+
+            if pcie_data.width is not None and pcie_data.width != 16:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} PCIe width is not x16",
+                    priority=EventPriority.ERROR,
+                    data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": 16},
+                    console_log=True,
+                )
+
+            if pcie_data.speed is not None and pcie_data.speed.value is not None:
+                try:
+                    speed_val = float(pcie_data.speed.value)
+                    if speed_val != 32.0:
+                        self._log_event(
+                            category=EventCategory.IO,
+                            description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)",
+                            priority=EventPriority.ERROR,
+                            data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0},
+                            console_log=True,
+                        )
+                except (ValueError, TypeError):
+                    pass
+
+            if pcie_data.replay_count is not None and pcie_data.replay_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "replay_count": pcie_data.replay_count},
+                    console_log=True,
+                )
+
+            if (
+                pcie_data.replay_roll_over_count is not None
+                and pcie_data.replay_roll_over_count > 0
+            ):
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count},
+                    console_log=True,
+                )
+
+            if pcie_data.l0_to_recovery_count is not None:
+                if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold:
+                    self._log_event(
+                        category=EventCategory.IO,
+                        description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
+                        priority=EventPriority.ERROR,
+                        data={
+                            "gpu": gpu,
+                            "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
+                            "error_threshold": l0_to_recovery_count_error_threshold,
+                        },
+                        console_log=True,
+                    )
+                elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold:
+                    self._log_event(
+                        category=EventCategory.IO,
+                        description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
+                            "warning_threshold": l0_to_recovery_count_warning_threshold,
+                        },
+                        console_log=True,
+                    )
+
+            if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count},
+                    console_log=True,
+                )
+
+            if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count},
+                    console_log=True,
+                )
+
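Review note: the L0-to-recovery counter is banded into OK / warning / error zones by the two thresholds, with the error band checked first. A compact standalone sketch of the same classification (the 100/10 threshold values are hypothetical, not defaults from the repo):

# Standalone sketch of the l0_to_recovery_count banding above;
# the default thresholds here are made-up example values.
def classify_l0_recoveries(count: int, error_threshold: int = 100, warning_threshold: int = 10) -> str:
    if count > error_threshold:
        return "ERROR"
    if count > warning_threshold:
        return "WARNING"
    return "OK"

assert classify_l0_recoveries(5) == "OK"
assert classify_l0_recoveries(50) == "WARNING"
assert classify_l0_recoveries(500) == "ERROR"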
+    def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
+        """Check ECC totals for all GPUs
+
+        Raises errors for uncorrectable errors, warnings for correctable and deferred.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+        """
+        for metric in amdsmi_metric_data:
+            ecc_totals = metric.ecc
+            gpu = metric.gpu
+
+            ecc_checks: list[tuple[EventPriority, Optional[int], str]] = [
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.total_correctable_count,
+                    "Total correctable ECC errors",
+                ),
+                (
+                    EventPriority.ERROR,
+                    ecc_totals.total_uncorrectable_count,
+                    "Total uncorrectable ECC errors",
+                ),
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.total_deferred_count,
+                    "Total deferred ECC errors",
+                ),
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.cache_correctable_count,
+                    "Cache correctable ECC errors",
+                ),
+                (
+                    EventPriority.ERROR,
+                    ecc_totals.cache_uncorrectable_count,
+                    "Cache uncorrectable ECC errors",
+                ),
+            ]
+
+            for priority, count, desc in ecc_checks:
+                if count is not None and count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has {desc}: {count}",
+                        priority=priority,
+                        data={"gpu": gpu, "error_count": count, "error_type": desc},
+                        console_log=True,
+                    )
+
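Review note: the ECC-totals check is table-driven: each row is a (priority, counter, description) tuple, so covering a new counter is a one-row change. A minimal standalone sketch of the pattern (Priority and the counter values are stand-ins, not the real EventPriority or collected data):

# Table-driven check pattern, standalone; values are hypothetical.
from enum import IntEnum
from typing import Optional

class Priority(IntEnum):  # stand-in for EventPriority
    WARNING = 1
    ERROR = 2

checks: list[tuple[Priority, Optional[int], str]] = [
    (Priority.WARNING, 3, "Total correctable ECC errors"),
    (Priority.ERROR, None, "Total uncorrectable ECC errors"),  # None -> skipped
]

for priority, count, desc in checks:
    if count is not None and count > 0:
        print(f"{priority.name}: {desc}: {count}")  # -> WARNING: Total correctable ECC errors: 3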
+    def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
+        """Check ECC counts in all blocks for all GPUs
+
+        Raises errors for uncorrectable errors, warnings for correctable and deferred.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+        """
+        for metric in amdsmi_metric_data:
+            gpu = metric.gpu
+            ecc_blocks = metric.ecc_blocks
+
+            # Skip if ecc_blocks is a string (e.g., "N/A") or empty
+            if isinstance(ecc_blocks, str) or not ecc_blocks:
+                continue
+
+            for block_name, ecc_data in ecc_blocks.items():
+                if not isinstance(ecc_data, EccData):
+                    continue
+
+                if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has correctable ECC errors in block {block_name}",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "correctable_count": ecc_data.correctable_count,
+                        },
+                        console_log=True,
+                    )
+
+                if ecc_data.uncorrectable_count is not None and ecc_data.uncorrectable_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}",
+                        priority=EventPriority.ERROR,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "uncorrectable_count": ecc_data.uncorrectable_count,
+                        },
+                        console_log=True,
+                    )
+
+                if ecc_data.deferred_count is not None and ecc_data.deferred_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has deferred ECC errors in block {block_name}",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "deferred_count": ecc_data.deferred_count,
+                        },
+                        console_log=True,
+                    )
+
     def expected_gpu_processes(
         self, processes_data: Optional[list[Processes]], max_num_processes: int
     ):
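Review note: ecc_blocks can arrive from the collector as a plain string such as "N/A", and individual block entries can too, so the loop guards both levels before reading counters. A toy illustration of the two guards (EccStub is a hypothetical stand-in for the real EccData model):

# Toy illustration of the container-level and entry-level guards.
from dataclasses import dataclass
from typing import Optional

@dataclass
class EccStub:  # hypothetical stand-in for EccData
    correctable_count: Optional[int] = None

samples = ["N/A", {}, {"umc": EccStub(correctable_count=2), "gfx": "N/A"}]
for ecc_blocks in samples:
    if isinstance(ecc_blocks, str) or not ecc_blocks:
        continue  # collector returned "N/A" or an empty mapping
    for block_name, ecc_data in ecc_blocks.items():
        if not isinstance(ecc_data, EccStub):
            continue  # an individual block entry can itself be "N/A"
        if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0:
            print(block_name, ecc_data.correctable_count)  # -> umc 2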
@@ -398,8 +623,6 @@ def check_expected_memory_partition_mode(
                 }
             )
 
-        # accelerator currently not avaialbe in API
-
         if bad_memory_partition_mode_gpus:
             self._log_event(
                 category=EventCategory.PLATFORM,
@@ -429,6 +652,16 @@ def analyze_data(
         if args is None:
             args = AmdSmiAnalyzerArgs()
 
+        if data.metric is not None and len(data.metric) > 0:
+            if args.l0_to_recovery_count_error_threshold is not None:
+                self.check_amdsmi_metric_pcie(
+                    data.metric,
+                    args.l0_to_recovery_count_error_threshold,
+                    args.l0_to_recovery_count_warning_threshold or 1,
+                )
+            self.check_amdsmi_metric_ecc_totals(data.metric)
+            self.check_amdsmi_metric_ecc(data.metric)
+
         if args.expected_gpu_processes:
             self.expected_gpu_processes(data.process, args.expected_gpu_processes)
