Skip to content

Commit 315c7d4

Browse files
committed
added more analyzer parts
1 parent f4a4064 commit 315c7d4

File tree

3 files changed

+228
-19
lines changed

3 files changed

+228
-19
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 208 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from nodescraper.interfaces import DataAnalyzer
3030
from nodescraper.models import TaskResult
3131

32-
from .amdsmidata import AmdSmiDataModel, Fw, Partition, Processes
32+
from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes
3333
from .analyzer_args import AmdSmiAnalyzerArgs
3434

3535

@@ -41,6 +41,75 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
4141
L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3
4242
L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1
4343

44+
def check_expected_max_power(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_max_power: int,
):
    """Check every GPU's max power limit against the expected value.

    Events logged:
      * WARNING per GPU that has no limit / max power data.
      * ERROR per GPU whose max power value cannot be converted to a number.
      * One aggregated ERROR listing every GPU whose max power differs from
        ``expected_max_power``.

    Args:
        amdsmi_static_data: Static amd-smi entries to inspect.
        expected_max_power: Max power limit each GPU is expected to report.
    """
    incorrect_max_power_gpus: dict[int, float] = {}
    for gpu in amdsmi_static_data:
        if gpu.limit is None or gpu.limit.max_power is None:
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has no max power limit set",
                priority=EventPriority.WARNING,
                data={"gpu": gpu.gpu},
            )
            continue

        max_power_value = gpu.limit.max_power.value
        try:
            max_power_float = float(max_power_value)
        except (TypeError, ValueError):
            # float() raises ValueError for non-numeric strings and TypeError
            # for None / unsupported types; both mean the reading is unusable,
            # so report it and keep checking the remaining GPUs.
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has an invalid max power limit",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu.gpu,
                    "max_power_value": max_power_value,
                },
            )
            continue

        if max_power_float != expected_max_power:
            incorrect_max_power_gpus[gpu.gpu] = max_power_float

    if incorrect_max_power_gpus:
        # Single aggregated event so a fleet-wide mismatch does not flood the log.
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Max power mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": list(incorrect_max_power_gpus),
                "max_power_values": incorrect_max_power_gpus,
                "expected_max_power": expected_max_power,
            },
        )
87+
88+
def check_expected_driver_version(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_driver_version: str,
):
    """Log one error event if any GPU reports an unexpected driver version.

    Args:
        amdsmi_static_data: Static amd-smi entries to inspect.
        expected_driver_version: Driver version string every GPU should report.
    """
    bad_driver_gpus = [
        entry.gpu
        for entry in amdsmi_static_data
        if entry.driver.version != expected_driver_version
    ]
    if not bad_driver_gpus:
        return

    # Map each offending GPU id to the version it actually reported.
    reported_versions = {
        entry.gpu: entry.driver.version
        for entry in amdsmi_static_data
        if entry.gpu in bad_driver_gpus
    }
    self._log_event(
        category=EventCategory.PLATFORM,
        description="Driver Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpus": bad_driver_gpus,
            "driver_version": reported_versions,
            "expected_driver_version": expected_driver_version,
        },
    )
112+
44113
def expected_gpu_processes(
45114
self, processes_data: list[Processes] | None, max_num_processes: int
46115
):
@@ -63,7 +132,7 @@ def expected_gpu_processes(
63132
# Skip if there are no processes or the process info is a string which indicates no processes
64133
continue
65134

66-
process_count = len(process.process_list) # Number of processes for GPU
135+
process_count = len(process.process_list)
67136
if process_count > max_num_processes:
68137
gpu_exceeds_num_processes[process.gpu] = process_count
69138

@@ -78,6 +147,85 @@ def expected_gpu_processes(
78147
console_log=True,
79148
)
80149

150+
def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
    """Verify that the ASIC identity fields are identical on every GPU.

    For each inspected ASIC field, the distinct values seen across all GPUs
    are gathered; a field with more than one distinct value produces an
    error event naming the field and the values observed.

    Args:
        amdsmi_static_data: Static amd-smi entries to compare.
    """
    asic_fields = (
        "market_name",
        "vendor_id",
        "vendor_name",
        "subvendor_id",
        "subsystem_id",
        "device_id",
        "rev_id",
        "num_compute_units",
        "target_graphics_version",
    )
    # Collect every field's distinct values up front, before any event is logged.
    observed: dict[str, set[str] | set[int]] = {
        field: {getattr(entry.asic, field) for entry in amdsmi_static_data}
        for field in asic_fields
    }
    for field, distinct_values in observed.items():
        if len(distinct_values) <= 1:
            continue
        self._log_event(
            category=EventCategory.PLATFORM,
            description=f"{field} is not consistent across all GPUs",
            priority=EventPriority.ERROR,
            data={
                "field": field,
                "non_consistent_values": distinct_values,
            },
        )
176+
177+
def check_static_data(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    vendor_id: str | None,
    subvendor_id: str | None,
    device_id: tuple[str | None, str | None],
    subsystem_id: tuple[str | None, str | None],
    sku_name: str,
):
    """Compare each GPU's ASIC identity against the expected values.

    Expected strings are matched as substrings of the reported values.
    An expected value of ``None`` disables the corresponding check. The
    ``device_id`` and ``subsystem_id`` pairs are only checked when both
    entries of the pair are set, and a GPU passes if either entry matches
    (case-insensitively). All mismatches are reported in a single error
    event.

    Args:
        amdsmi_static_data: Static amd-smi entries to inspect.
        vendor_id: Expected vendor id, or None to skip.
        subvendor_id: Expected subvendor id, or None to skip.
        device_id: Pair of acceptable device ids.
        subsystem_id: Pair of acceptable subsystem ids.
        sku_name: Expected text within the reported market name.
    """
    mismatch_gpus: list[tuple[int, str, str]] = []
    expected_data: dict[str, str | None] = {
        "vendor_id": vendor_id,
        "subvendor_id": subvendor_id,
        "vendor_name": "Advanced Micro Devices Inc",
        "market_name": sku_name,
    }
    for gpu_data in amdsmi_static_data:
        asic = gpu_data.asic
        # Built once per GPU. market_name must come from the GPU itself:
        # filling it with sku_name would compare the expectation with itself.
        collected_data: dict[str, str] = {
            "vendor_id": asic.vendor_id,
            "subvendor_id": asic.subvendor_id,
            "vendor_name": asic.vendor_name,
            "market_name": asic.market_name,
        }
        for key, expected in expected_data.items():
            if expected is not None and expected not in collected_data[key]:
                mismatch_gpus.append((gpu_data.gpu, key, collected_data[key]))
                # Only the first mismatching basic field is reported per GPU.
                break

        if device_id[0] is not None and device_id[1] is not None:
            reported_device_id = asic.device_id.upper()
            if (
                device_id[0].upper() not in reported_device_id
                and device_id[1].upper() not in reported_device_id
            ):
                mismatch_gpus.append((gpu_data.gpu, "device_id", asic.device_id))

        if subsystem_id[0] is not None and subsystem_id[1] is not None:
            reported_subsystem_id = asic.subsystem_id.upper()
            if (
                subsystem_id[0].upper() not in reported_subsystem_id
                and subsystem_id[1].upper() not in reported_subsystem_id
            ):
                mismatch_gpus.append((gpu_data.gpu, "subsystem_id", asic.subsystem_id))

    if mismatch_gpus:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="amd-smi static data mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": [data[0] for data in mismatch_gpus],
                "key": [data[1] for data in mismatch_gpus],
                "collected_data": [data[2] for data in mismatch_gpus],
            },
        )
228+
81229
def check_pldm_version(
82230
self,
83231
amdsmi_fw_data: list[Fw] | None,
@@ -98,9 +246,9 @@ def check_pldm_version(
98246
for fw_data in amdsmi_fw_data:
99247
gpu = fw_data.gpu
100248
for fw_info in fw_data.fw_list:
101-
if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version:
249+
if PLDM_STRING == fw_info.fw_name and expected_pldm_version != fw_info.fw_version:
102250
mismatched_gpus.append(gpu)
103-
if PLDM_STRING == fw_info.fw_id:
251+
if PLDM_STRING == fw_info.fw_name:
104252
break
105253
else:
106254
pldm_missing_gpus.append(gpu)
@@ -131,21 +279,32 @@ def check_expected_memory_partition_mode(
131279
)
132280
return
133281
bad_memory_partition_mode_gpus = []
134-
for partition_current in partition_data.current_partition:
282+
for partition_current in partition_data.memory_partition:
135283
if (
136284
expected_memory_partition_mode is not None
137-
and partition_current.memory != expected_memory_partition_mode
138-
) or (
285+
and partition_current.partition_type != expected_memory_partition_mode
286+
):
287+
bad_memory_partition_mode_gpus.append(
288+
{
289+
"gpu_id": partition_current.gpu_id,
290+
"memory_partition_mode": partition_current.partition_type,
291+
}
292+
)
293+
294+
for partition_current in partition_data.compute_partition:
295+
if (
139296
expected_compute_partition_mode is not None
140-
and partition_current.accelerator_type != expected_compute_partition_mode
297+
and partition_current.partition_type != expected_compute_partition_mode
141298
):
142299
bad_memory_partition_mode_gpus.append(
143300
{
144301
"gpu_id": partition_current.gpu_id,
145-
"compute_partition_mode": partition_current.accelerator_type,
146-
"memory_partition_mode": partition_current.memory,
302+
"compute_partition_mode": partition_current.partition_type,
147303
}
148304
)
305+
306+
# accelerator partition data is currently not available in the API
307+
149308
if bad_memory_partition_mode_gpus:
150309
self._log_event(
151310
category=EventCategory.PLATFORM,
@@ -163,13 +322,52 @@ def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult:
163322
if args is None:
164323
args = AmdSmiAnalyzerArgs()
165324

325+
if args.l0_to_recovery_count_error_threshold is None:
326+
args.l0_to_recovery_count_error_threshold = self.L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD
327+
if args.l0_to_recovery_count_warning_threshold is None:
328+
args.l0_to_recovery_count_warning_threshold = (
329+
self.L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD
330+
)
331+
166332
if args.expected_gpu_processes:
167333
self.expected_gpu_processes(data.process, args.expected_gpu_processes)
334+
335+
if data.static is None or len(data.static) == 0:
336+
self._log_event(
337+
category=EventCategory.PLATFORM,
338+
description="No AMD SMI static data available",
339+
priority=EventPriority.WARNING,
340+
data={"amdsmi_static_data": data.static},
341+
)
342+
else:
343+
if args.expected_max_power:
344+
self.check_expected_max_power(data.static, args.expected_max_power)
345+
if args.expected_driver_version:
346+
self.check_expected_driver_version(data.static, args.expected_driver_version)
168347
if args.expected_memory_partition_mode or args.expected_compute_partition_mode:
169348
self.check_expected_memory_partition_mode(
170349
data.partition,
171350
args.expected_memory_partition_mode,
172351
args.expected_compute_partition_mode,
173352
)
353+
self.static_consistancy_check(data.static)
354+
if (
355+
self.system_info.sku
356+
and args.devid_ep
357+
and args.devid_ep_vf
358+
and args.vendorid_ep
359+
and args.check_static_data
360+
) or args.check_static_data:
361+
self.check_static_data(
362+
data.static,
363+
args.vendorid_ep,
364+
args.vendorid_ep,
365+
(args.devid_ep, args.devid_ep),
366+
(args.devid_ep, args.devid_ep),
367+
sku_name=args.sku_name,
368+
)
369+
370+
if args.expected_pldm_version:
371+
self.check_pldm_version(data.firmware, args.expected_pldm_version)
174372

175373
return self.result

nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,19 @@
2525
###############################################################################
2626
from nodescraper.base import InBandDataPlugin
2727

28+
from .amdsmi_analyzer import AmdSmiAnalyzer
2829
from .amdsmi_collector import AmdSmiCollector
2930
from .amdsmidata import AmdSmiDataModel
31+
from .analyzer_args import AmdSmiAnalyzerArgs
3032

3133

32-
class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, AmdSmiAnalyzerArgs]):
    """In-band plugin that collects amd-smi data and analyzes it."""

    # Data model the plugin works with.
    DATA_MODEL = AmdSmiDataModel

    # Collection step.
    COLLECTOR = AmdSmiCollector

    # Analysis step and the argument model it accepts.
    ANALYZER = AmdSmiAnalyzer
    ANALYZER_ARGS = AmdSmiAnalyzerArgs

nodescraper/plugins/inband/amdsmi/analyzer_args.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,16 @@
3030

3131
class AmdSmiAnalyzerArgs(AnalyzerArgs):
    """Arguments selecting which amd-smi checks the analyzer runs.

    Every expectation defaults to ``None`` (or ``False``) so that a check
    only runs when the caller explicitly supplies a value for it.
    """

    # Enables the static ASIC identity comparison.
    check_static_data: bool = False

    # Expected values; None leaves the corresponding check disabled.
    expected_gpu_processes: Optional[int] = None
    expected_max_power: Optional[int] = None
    expected_driver_version: Optional[str] = None
    expected_memory_partition_mode: Optional[str] = None
    expected_compute_partition_mode: Optional[str] = None
    expected_pldm_version: Optional[str] = None

    # None lets the analyzer substitute its own class-level thresholds.
    l0_to_recovery_count_error_threshold: Optional[int] = None
    l0_to_recovery_count_warning_threshold: Optional[int] = None

    # Expected identity values used by the static data comparison.
    vendorid_ep: Optional[str] = None
    vendorid_ep_vf: Optional[str] = None
    devid_ep: Optional[str] = None
    # Read by the analyzer alongside devid_ep, so it must be declared here.
    devid_ep_vf: Optional[str] = None
    sku_name: Optional[str] = None

0 commit comments

Comments
 (0)