Commit 26b689e

Merge pull request #64 from amd/alex_amdsmi2
AmdSmiPlugin update: Metric + BadPages

2 parents 965103b + e13a7cc commit 26b689e

File tree

4 files changed (+718, -33 lines)

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py
Lines changed: 236 additions & 3 deletions

@@ -30,7 +30,15 @@
 from nodescraper.interfaces import DataAnalyzer
 from nodescraper.models import TaskResult
 
-from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes
+from .amdsmidata import (
+    AmdSmiDataModel,
+    AmdSmiMetric,
+    AmdSmiStatic,
+    EccData,
+    Fw,
+    Partition,
+    Processes,
+)
 from .analyzer_args import AmdSmiAnalyzerArgs
 
 
@@ -122,6 +130,223 @@ def check_expected_driver_version(
                 },
             )
 
+    def check_amdsmi_metric_pcie(
+        self,
+        amdsmi_metric_data: list[AmdSmiMetric],
+        l0_to_recovery_count_error_threshold: int,
+        l0_to_recovery_count_warning_threshold: int,
+    ):
+        """Check PCIe metrics for link errors
+
+        Checks for PCIe link width, speed, replays, recoveries, and NAKs.
+        Expected width/speeds should come from SKU info.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+            l0_to_recovery_count_error_threshold (int): Threshold for error events
+            l0_to_recovery_count_warning_threshold (int): Threshold for warning events
+        """
+        for metric in amdsmi_metric_data:
+            pcie_data = metric.pcie
+            gpu = metric.gpu
+
+            if pcie_data.width is not None and pcie_data.width != 16:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} PCIe width is not x16",
+                    priority=EventPriority.ERROR,
+                    data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": 16},
+                    console_log=True,
+                )
+
+            if pcie_data.speed is not None and pcie_data.speed.value is not None:
+                try:
+                    speed_val = float(pcie_data.speed.value)
+                    if speed_val != 32.0:
+                        self._log_event(
+                            category=EventCategory.IO,
+                            description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)",
+                            priority=EventPriority.ERROR,
+                            data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0},
+                            console_log=True,
+                        )
+                except (ValueError, TypeError):
+                    pass
+
+            if pcie_data.replay_count is not None and pcie_data.replay_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "replay_count": pcie_data.replay_count},
+                    console_log=True,
+                )
+
+            if (
+                pcie_data.replay_roll_over_count is not None
+                and pcie_data.replay_roll_over_count > 0
+            ):
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count},
+                    console_log=True,
+                )
+
+            if pcie_data.l0_to_recovery_count is not None:
+                if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold:
+                    self._log_event(
+                        category=EventCategory.IO,
+                        description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
+                        priority=EventPriority.ERROR,
+                        data={
+                            "gpu": gpu,
+                            "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
+                            "error_threshold": l0_to_recovery_count_error_threshold,
+                        },
+                        console_log=True,
+                    )
+                elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold:
+                    self._log_event(
+                        category=EventCategory.IO,
+                        description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
+                            "warning_threshold": l0_to_recovery_count_warning_threshold,
+                        },
+                        console_log=True,
+                    )
+
+            if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count},
+                    console_log=True,
+                )
+
+            if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0:
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
+                    priority=EventPriority.WARNING,
+                    data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count},
+                    console_log=True,
+                )
+
+    def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
+        """Check ECC totals for all GPUs
+
+        Raises errors for uncorrectable errors, warnings for correctable and deferred.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+        """
+        for metric in amdsmi_metric_data:
+            ecc_totals = metric.ecc
+            gpu = metric.gpu
+
+            ecc_checks: list[tuple[EventPriority, Optional[int], str]] = [
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.total_correctable_count,
+                    "Total correctable ECC errors",
+                ),
+                (
+                    EventPriority.ERROR,
+                    ecc_totals.total_uncorrectable_count,
+                    "Total uncorrectable ECC errors",
+                ),
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.total_deferred_count,
+                    "Total deferred ECC errors",
+                ),
+                (
+                    EventPriority.WARNING,
+                    ecc_totals.cache_correctable_count,
+                    "Cache correctable ECC errors",
+                ),
+                (
+                    EventPriority.ERROR,
+                    ecc_totals.cache_uncorrectable_count,
+                    "Cache uncorrectable ECC errors",
+                ),
+            ]
+
+            for priority, count, desc in ecc_checks:
+                if count is not None and count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has {desc}: {count}",
+                        priority=priority,
+                        data={"gpu": gpu, "error_count": count, "error_type": desc},
+                        console_log=True,
+                    )
+
+    def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
+        """Check ECC counts in all blocks for all GPUs
+
+        Raises errors for uncorrectable errors, warnings for correctable and deferred.
+
+        Args:
+            amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
+        """
+        for metric in amdsmi_metric_data:
+            gpu = metric.gpu
+            ecc_blocks = metric.ecc_blocks
+
+            # Skip if ecc_blocks is a string (e.g., "N/A") or empty
+            if isinstance(ecc_blocks, str) or not ecc_blocks:
+                continue
+
+            for block_name, ecc_data in ecc_blocks.items():
+                if not isinstance(ecc_data, EccData):
+                    continue
+
+                if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has correctable ECC errors in block {block_name}",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "correctable_count": ecc_data.correctable_count,
+                        },
+                        console_log=True,
+                    )
+
+                if ecc_data.uncorrectable_count is not None and ecc_data.uncorrectable_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}",
+                        priority=EventPriority.ERROR,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "uncorrectable_count": ecc_data.uncorrectable_count,
+                        },
+                        console_log=True,
+                    )
+
+                if ecc_data.deferred_count is not None and ecc_data.deferred_count > 0:
+                    self._log_event(
+                        category=EventCategory.RAS,
+                        description=f"GPU: {gpu} has deferred ECC errors in block {block_name}",
+                        priority=EventPriority.WARNING,
+                        data={
+                            "gpu": gpu,
+                            "block": block_name,
+                            "deferred_count": ecc_data.deferred_count,
+                        },
+                        console_log=True,
+                    )
+
     def expected_gpu_processes(
         self, processes_data: Optional[list[Processes]], max_num_processes: int
     ):
@@ -398,8 +623,6 @@ def check_expected_memory_partition_mode(
                 }
             )
 
-        # accelerator currently not avaialbe in API
-
         if bad_memory_partition_mode_gpus:
             self._log_event(
                 category=EventCategory.PLATFORM,
@@ -429,6 +652,16 @@
         if args is None:
            args = AmdSmiAnalyzerArgs()
 
+        if data.metric is not None and len(data.metric) > 0:
+            if args.l0_to_recovery_count_error_threshold is not None:
+                self.check_amdsmi_metric_pcie(
+                    data.metric,
+                    args.l0_to_recovery_count_error_threshold,
+                    args.l0_to_recovery_count_warning_threshold or 1,
+                )
+            self.check_amdsmi_metric_ecc_totals(data.metric)
+            self.check_amdsmi_metric_ecc(data.metric)
+
        if args.expected_gpu_processes:
            self.expected_gpu_processes(data.process, args.expected_gpu_processes)

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py
Lines changed: 51 additions & 12 deletions

@@ -106,7 +106,7 @@ def _run_amd_smi(self, cmd: str) -> Optional[str]:
             or "User is missing the following required groups" in cmd_ret.stdout
         )
 
-        # Check for known amd-smi internal bugs
+        # Check for known amd-smi internal errors
         is_amdsmi_internal_error = any(
             pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"]
         )
@@ -183,19 +183,50 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]:
         cmd_ret = self._run_amd_smi(cmd)
         if cmd_ret:
             try:
+                # Try to parse as single JSON first
                 return json.loads(cmd_ret)
             except json.JSONDecodeError as e:
-                self._log_event(
-                    category=EventCategory.APPLICATION,
-                    description=f"Error parsing command: `{cmd}` json data",
-                    data={
-                        "cmd": cmd,
-                        "exception": get_exception_traceback(e),
-                    },
-                    priority=EventPriority.ERROR,
-                    console_log=True,
-                )
-                return None
+                # try to extract and parse multiple JSON objects
+                try:
+                    json_objects = []
+                    decoder = json.JSONDecoder()
+                    idx = 0
+                    cmd_ret_stripped = cmd_ret.strip()
+
+                    while idx < len(cmd_ret_stripped):
+                        while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace():
+                            idx += 1
+
+                        if idx >= len(cmd_ret_stripped):
+                            break
+
+                        if cmd_ret_stripped[idx] not in ["{", "["]:
+                            break
+
+                        try:
+                            obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx)
+                            json_objects.append(obj)
+                            idx = end_idx
+                        except json.JSONDecodeError:
+                            break
+
+                    if json_objects:
+                        return json_objects if len(json_objects) > 1 else json_objects[0]
+                    else:
+                        raise
+
+                except Exception:
+                    self._log_event(
+                        category=EventCategory.APPLICATION,
+                        description=f"Error parsing command: `{cmd}` json data",
+                        data={
+                            "cmd": cmd,
+                            "exception": get_exception_traceback(e),
+                        },
+                        priority=EventPriority.ERROR,
+                        console_log=True,
+                    )
+                    return None
         return None
 
     def _to_number(self, v: object) -> Optional[Union[int, float]]:
@@ -498,7 +529,15 @@ def get_partition(self) -> Optional[Partition]:
         memparts: list[PartitionMemory] = []
         computeparts: list[PartitionCompute] = []
 
+        # Flatten multi-JSON results (partition command returns multiple JSON arrays)
+        flattened_data = []
         for item in partition_data:
+            if isinstance(item, list):
+                flattened_data.extend(item)
+            elif isinstance(item, dict):
+                flattened_data.append(item)
+
+        for item in flattened_data:
             if not isinstance(item, dict):
                 continue
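The fallback path in _run_amd_smi_dict is built on json.JSONDecoder.raw_decode, which decodes one JSON value starting at a given index and returns the value together with the index just past it; calling it in a loop is the standard way to consume concatenated JSON documents, such as the multiple JSON arrays the partition command emits. A minimal sketch of that technique in isolation (the sample input is invented for illustration):

    import json

    def parse_concatenated_json(text: str) -> list:
        """Parse back-to-back JSON values from one string into a list."""
        decoder = json.JSONDecoder()
        objects: list = []
        idx = 0
        while idx < len(text):
            # Skip whitespace between documents
            while idx < len(text) and text[idx].isspace():
                idx += 1
            if idx >= len(text):
                break
            # raw_decode returns (value, index_past_value) and raises
            # json.JSONDecodeError on malformed input
            obj, idx = decoder.raw_decode(text, idx)
            objects.append(obj)
        return objects

    # Two JSON arrays back to back, then flattened the same way get_partition does:
    sample = '[{"gpu": 0}]\n[{"gpu": 1}]'
    parsed = parse_concatenated_json(sample)  # [[{'gpu': 0}], [{'gpu': 1}]]
    flat = [d for item in parsed
            for d in (item if isinstance(item, list) else [item])]
    print(flat)                               # [{'gpu': 0}, {'gpu': 1}]

This mirrors both new collector behaviors: multi-object decoding in _run_amd_smi_dict and list flattening in get_partition.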
