Skip to content

Commit a0b825e

Browse files
committed
updates on missing calls
1 parent 6b3e0bb commit a0b825e

File tree

1 file changed

+236
-1
lines changed

1 file changed

+236
-1
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 236 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,15 @@
3030
from nodescraper.interfaces import DataAnalyzer
3131
from nodescraper.models import TaskResult
3232

33-
from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes
33+
from .amdsmidata import (
34+
AmdSmiDataModel,
35+
AmdSmiMetric,
36+
AmdSmiStatic,
37+
EccData,
38+
Fw,
39+
Partition,
40+
Processes,
41+
)
3442
from .analyzer_args import AmdSmiAnalyzerArgs
3543

3644

@@ -122,6 +130,223 @@ def check_expected_driver_version(
122130
},
123131
)
124132

133+
def check_amdsmi_metric_pcie(
    self,
    amdsmi_metric_data: list[AmdSmiMetric],
    l0_to_recovery_count_error_threshold: int,
    l0_to_recovery_count_warning_threshold: int,
    *,
    expected_pcie_width: int = 16,
    expected_pcie_speed: float = 32.0,
):
    """Check PCIe metrics for link errors

    Checks for PCIe link width, speed, replays, recoveries, and NAKs.
    Expected width/speed default to x16 / 32 GT/s (Gen5) and can be overridden
    by the caller, e.g. from SKU info.

    Events logged per GPU (all in EventCategory.IO):
        - ERROR when the link width differs from ``expected_pcie_width``
        - ERROR when the link speed differs from ``expected_pcie_speed``
        - WARNING when the replay count or replay rollover count is above 0
        - ERROR when L0 recoveries exceed the error threshold, otherwise
          WARNING when they exceed the warning threshold
        - WARNING when the sent or received NAK count is above 0

    Fields that are None are skipped. A speed value that cannot be converted
    to float is skipped as well.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
        l0_to_recovery_count_error_threshold (int): Threshold for error events
        l0_to_recovery_count_warning_threshold (int): Threshold for warning events
        expected_pcie_width (int): Expected PCIe link width in lanes. Defaults to 16.
        expected_pcie_speed (float): Expected PCIe link speed in GT/s. Defaults to 32.0.
    """
    # Transfer rate (GT/s) -> generation name; only used to word the speed event.
    pcie_generations = {
        2.5: "Gen1",
        5.0: "Gen2",
        8.0: "Gen3",
        16.0: "Gen4",
        32.0: "Gen5",
        64.0: "Gen6",
    }
    expected_speed = float(expected_pcie_speed)
    expected_speed_label = f"{expected_speed:g} GT/s"
    generation = pcie_generations.get(expected_speed)
    if generation is not None:
        expected_speed_label = f"{generation} ({expected_speed_label})"

    for metric in amdsmi_metric_data:
        pcie_data = metric.pcie
        gpu = metric.gpu

        if pcie_data.width is not None and pcie_data.width != expected_pcie_width:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} PCIe width is not x{expected_pcie_width}",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu,
                    "pcie_width": pcie_data.width,
                    "expected": expected_pcie_width,
                },
                console_log=True,
            )

        speed_val = None
        if pcie_data.speed is not None and pcie_data.speed.value is not None:
            # Only the conversion is guarded: an exception raised while logging
            # the event must not be swallowed together with a bad reading.
            try:
                speed_val = float(pcie_data.speed.value)
            except (ValueError, TypeError):
                # Non-numeric reading: nothing to compare, skip the speed check.
                speed_val = None

        if speed_val is not None and speed_val != expected_speed:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} PCIe link speed is not {expected_speed_label}",
                priority=EventPriority.ERROR,
                data={"gpu": gpu, "pcie_speed": speed_val, "expected": expected_speed},
                console_log=True,
            )

        if pcie_data.replay_count is not None and pcie_data.replay_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_count": pcie_data.replay_count},
                console_log=True,
            )

        if (
            pcie_data.replay_roll_over_count is not None
            and pcie_data.replay_roll_over_count > 0
        ):
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count},
                console_log=True,
            )

        if pcie_data.l0_to_recovery_count is not None:
            # The error threshold is tested first, so a count above both
            # thresholds produces a single ERROR event rather than two events.
            if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
                    priority=EventPriority.ERROR,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
                        "error_threshold": l0_to_recovery_count_error_threshold,
                    },
                    console_log=True,
                )
            elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
                        "warning_threshold": l0_to_recovery_count_warning_threshold,
                    },
                    console_log=True,
                )

        if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count},
                console_log=True,
            )

        if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count},
                console_log=True,
            )
240+
241+
def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Report non-zero aggregate ECC counters for every GPU

    For each metric entry the total and cache ECC counters are inspected.
    Uncorrectable counters are logged as errors; correctable and deferred
    counters are logged as warnings. Counters that are None or not above
    zero produce no event.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for entry in amdsmi_metric_data:
        totals = entry.ecc
        gpu = entry.gpu

        # (severity, counter value, human readable label) in reporting order
        counters: tuple[tuple[EventPriority, Optional[int], str], ...] = (
            (EventPriority.WARNING, totals.total_correctable_count, "Total correctable ECC errors"),
            (EventPriority.ERROR, totals.total_uncorrectable_count, "Total uncorrectable ECC errors"),
            (EventPriority.WARNING, totals.total_deferred_count, "Total deferred ECC errors"),
            (EventPriority.WARNING, totals.cache_correctable_count, "Cache correctable ECC errors"),
            (EventPriority.ERROR, totals.cache_uncorrectable_count, "Cache uncorrectable ECC errors"),
        )

        for severity, value, label in counters:
            # Nothing to report for missing or zero counters
            if value is None or not value > 0:
                continue

            self._log_event(
                category=EventCategory.RAS,
                description=f"GPU: {gpu} has {label}: {value}",
                priority=severity,
                data={"gpu": gpu, "error_count": value, "error_type": label},
                console_log=True,
            )
290+
291+
def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Report non-zero per-block ECC counters for every GPU

    Walks the ECC blocks of each metric entry. Uncorrectable counts are
    logged as errors; correctable and deferred counts are logged as warnings.
    Entries whose ecc_blocks is a string or empty, and block values that are
    not EccData instances, are ignored.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for entry in amdsmi_metric_data:
        gpu = entry.gpu
        blocks = entry.ecc_blocks

        # ecc_blocks may be a placeholder string (e.g., "N/A") or empty
        if not blocks or isinstance(blocks, str):
            continue

        for block_name, block_ecc in blocks.items():
            if not isinstance(block_ecc, EccData):
                continue

            # (counter value, data key, description, severity) in reporting order
            findings = (
                (
                    block_ecc.correctable_count,
                    "correctable_count",
                    f"GPU: {gpu} has correctable ECC errors in block {block_name}",
                    EventPriority.WARNING,
                ),
                (
                    block_ecc.uncorrectable_count,
                    "uncorrectable_count",
                    f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}",
                    EventPriority.ERROR,
                ),
                (
                    block_ecc.deferred_count,
                    "deferred_count",
                    f"GPU: {gpu} has deferred ECC errors in block {block_name}",
                    EventPriority.WARNING,
                ),
            )

            for value, data_key, message, severity in findings:
                if value is None or not value > 0:
                    continue

                self._log_event(
                    category=EventCategory.RAS,
                    description=message,
                    priority=severity,
                    data={"gpu": gpu, "block": block_name, data_key: value},
                    console_log=True,
                )
349+
125350
def expected_gpu_processes(
126351
self, processes_data: Optional[list[Processes]], max_num_processes: int
127352
):
@@ -427,6 +652,16 @@ def analyze_data(
427652
if args is None:
428653
args = AmdSmiAnalyzerArgs()
429654

655+
if data.metric is not None and len(data.metric) > 0:
656+
if args.l0_to_recovery_count_error_threshold is not None:
657+
self.check_amdsmi_metric_pcie(
658+
data.metric,
659+
args.l0_to_recovery_count_error_threshold,
660+
args.l0_to_recovery_count_warning_threshold or 1,
661+
)
662+
self.check_amdsmi_metric_ecc_totals(data.metric)
663+
self.check_amdsmi_metric_ecc(data.metric)
664+
430665
if args.expected_gpu_processes:
431666
self.expected_gpu_processes(data.process, args.expected_gpu_processes)
432667

0 commit comments

Comments
 (0)