2929from nodescraper .interfaces import DataAnalyzer
3030from nodescraper .models import TaskResult
3131
32- from .amdsmidata import AmdSmiDataModel , Fw , Partition , Processes
32+ from .amdsmidata import AmdSmiDataModel , AmdSmiStatic , Fw , Partition , Processes
3333from .analyzer_args import AmdSmiAnalyzerArgs
3434
3535
@@ -41,6 +41,75 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
4141 L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3
4242 L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1
4343
def check_expected_max_power(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_max_power: int,
) -> None:
    """Check that every GPU reports the expected max power limit.

    For each GPU entry:

    * if ``limit`` or ``limit.max_power`` is ``None``, a WARNING event is
      logged and the GPU is skipped;
    * if ``limit.max_power.value`` cannot be converted with ``float()``,
      an ERROR event is logged and the GPU is skipped;
    * otherwise the converted value is compared with
      ``expected_max_power``.

    After all GPUs have been inspected, a single "Max power mismatch" ERROR
    event is logged listing every GPU whose value differed.

    Args:
        amdsmi_static_data: Per-GPU static data entries to inspect.
        expected_max_power: Value each GPU's max power limit must equal.
    """
    # Maps GPU id -> the (converted) max power value that did not match.
    incorrect_max_power_gpus: dict[int, float] = {}
    for gpu in amdsmi_static_data:
        if gpu.limit is None or gpu.limit.max_power is None:
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has no max power limit set",
                priority=EventPriority.WARNING,
                data={"gpu": gpu.gpu},
            )
            continue
        max_power_value = gpu.limit.max_power.value
        try:
            max_power_float = float(max_power_value)
        except (TypeError, ValueError):
            # float() raises ValueError for unparsable strings and TypeError
            # for None / non-numeric objects; both mean the reported limit
            # is unusable, so report it instead of aborting the analysis.
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has an invalid max power limit",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu.gpu,
                    "max_power_value": max_power_value,
                },
            )
            continue
        if max_power_float != expected_max_power:
            incorrect_max_power_gpus[gpu.gpu] = max_power_float
    if incorrect_max_power_gpus:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Max power mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": list(incorrect_max_power_gpus),
                "max_power_values": incorrect_max_power_gpus,
                "expected_max_power": expected_max_power,
            },
        )
87+
def check_expected_driver_version(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_driver_version: str,
):
    """Check that every GPU reports the expected driver version.

    Collects the ids of all GPUs whose ``driver.version`` differs from
    ``expected_driver_version``. If any are found, one
    "Driver Version Mismatch" ERROR event is logged containing those ids,
    the version reported for each of them, and the expected version.

    Args:
        amdsmi_static_data: Per-GPU static data entries to inspect.
        expected_driver_version: Driver version string each GPU must report.
    """
    mismatched_ids = [
        entry.gpu
        for entry in amdsmi_static_data
        if entry.driver.version != expected_driver_version
    ]
    if not mismatched_ids:
        return

    # Second pass keyed by GPU id: record the version reported by every
    # entry whose id was flagged above.
    reported_versions = {}
    for entry in amdsmi_static_data:
        if entry.gpu in mismatched_ids:
            reported_versions[entry.gpu] = entry.driver.version

    self._log_event(
        category=EventCategory.PLATFORM,
        description="Driver Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpus": mismatched_ids,
            "driver_version": reported_versions,
            "expected_driver_version": expected_driver_version,
        },
    )
112+
44113 def expected_gpu_processes (
45114 self , processes_data : list [Processes ] | None , max_num_processes : int
46115 ):
@@ -63,7 +132,7 @@ def expected_gpu_processes(
63132 # Skip if there are no processes or the process info is a string which indicates no processes
64133 continue
65134
66- process_count = len (process .process_list ) # Number of processes for GPU
135+ process_count = len (process .process_list )
67136 if process_count > max_num_processes :
68137 gpu_exceeds_num_processes [process .gpu ] = process_count
69138
@@ -78,6 +147,85 @@ def expected_gpu_processes(
78147 console_log = True ,
79148 )
80149
def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
    """Check that ASIC static fields are identical across all GPUs.

    For each inspected ``asic`` field, the set of distinct values reported
    by the GPUs is gathered. Every field with more than one distinct value
    produces its own ERROR event carrying the field name and the set of
    values seen.

    Args:
        amdsmi_static_data: Per-GPU static data entries to compare.
    """
    # Fields of ``gpu.asic`` to compare, in the order they are reported.
    asic_fields = (
        "market_name",
        "vendor_id",
        "vendor_name",
        "subvendor_id",
        "subsystem_id",
        "device_id",
        "rev_id",
        "num_compute_units",
        "target_graphics_version",
    )
    # Gather every field's distinct values up front, before any event is logged.
    distinct_values = {
        field: {getattr(entry.asic, field) for entry in amdsmi_static_data}
        for field in asic_fields
    }
    for field, seen in distinct_values.items():
        if len(seen) <= 1:
            continue
        self._log_event(
            category=EventCategory.PLATFORM,
            description=f"{field} is not consistent across all GPUs",
            priority=EventPriority.ERROR,
            data={
                "field": field,
                "non_consistent_values": seen,
            },
        )
176+
def check_static_data(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    vendor_id: str | None,
    subvendor_id: str | None,
    device_id: tuple[str | None, str | None],
    subsystem_id: tuple[str | None, str | None],
    sku_name: str,
):
    """Compare each GPU's ASIC static data against expected values.

    Checks performed per GPU:

    * ``vendor_id``, ``subvendor_id``, ``vendor_name`` and ``market_name``:
      the expected string must be contained in the collected value.
      Expected values that are ``None`` are skipped. Only the first
      mismatching field of this group is recorded for a GPU.
    * ``device_id`` / ``subsystem_id``: checked only when both tuple members
      are not ``None``; a mismatch is recorded when neither member is
      contained (case-insensitively) in the collected value.

    If any mismatch was recorded, a single "amd-smi static data mismatch"
    ERROR event is logged with parallel lists of GPU ids, field names and
    collected values.

    Args:
        amdsmi_static_data: Per-GPU static data entries to inspect.
        vendor_id: Expected vendor id, or ``None`` to skip the check.
        subvendor_id: Expected subvendor id, or ``None`` to skip the check.
        device_id: Pair of acceptable device ids.
        subsystem_id: Pair of acceptable subsystem ids.
        sku_name: Expected market name.
    """
    # Each entry is (gpu id, mismatching field, collected value).
    mismatch_gpus: list[tuple[int, str, str]] = []
    expected_data: dict[str, str | None] = {
        "vendor_id": vendor_id,
        "subvendor_id": subvendor_id,
        "vendor_name": "Advanced Micro Devices Inc",
        "market_name": sku_name,
    }
    for gpu_data in amdsmi_static_data:
        asic = gpu_data.asic
        # Built once per GPU. "market_name" must come from the GPU itself;
        # comparing sku_name against itself would make the check vacuous.
        collected_data = {
            "vendor_id": asic.vendor_id,
            "subvendor_id": asic.subvendor_id,
            "vendor_name": asic.vendor_name,
            "market_name": asic.market_name,
        }
        for key, expected in expected_data.items():
            if expected is not None and expected not in collected_data[key]:
                mismatch_gpus.append((gpu_data.gpu, key, collected_data[key]))
                # Report only the first mismatching field of this group.
                break
        if device_id[0] is not None and device_id[1] is not None:
            collected_device_id = asic.device_id.upper()
            if (
                device_id[0].upper() not in collected_device_id
                and device_id[1].upper() not in collected_device_id
            ):
                mismatch_gpus.append((gpu_data.gpu, "device_id", asic.device_id))
        if subsystem_id[0] is not None and subsystem_id[1] is not None:
            collected_subsystem_id = asic.subsystem_id.upper()
            if (
                subsystem_id[0].upper() not in collected_subsystem_id
                and subsystem_id[1].upper() not in collected_subsystem_id
            ):
                mismatch_gpus.append((gpu_data.gpu, "subsystem_id", asic.subsystem_id))
    if mismatch_gpus:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="amd-smi static data mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": [entry[0] for entry in mismatch_gpus],
                "key": [entry[1] for entry in mismatch_gpus],
                "collected_data": [entry[2] for entry in mismatch_gpus],
            },
        )
228+
81229 def check_pldm_version (
82230 self ,
83231 amdsmi_fw_data : list [Fw ] | None ,
@@ -98,9 +246,9 @@ def check_pldm_version(
98246 for fw_data in amdsmi_fw_data :
99247 gpu = fw_data .gpu
100248 for fw_info in fw_data .fw_list :
101- if PLDM_STRING == fw_info .fw_id and expected_pldm_version != fw_info .fw_version :
249+ if PLDM_STRING == fw_info .fw_name and expected_pldm_version != fw_info .fw_version :
102250 mismatched_gpus .append (gpu )
103- if PLDM_STRING == fw_info .fw_id :
251+ if PLDM_STRING == fw_info .fw_name :
104252 break
105253 else :
106254 pldm_missing_gpus .append (gpu )
@@ -131,21 +279,32 @@ def check_expected_memory_partition_mode(
131279 )
132280 return
133281 bad_memory_partition_mode_gpus = []
134- for partition_current in partition_data .current_partition :
282+ for partition_current in partition_data .memory_partition :
135283 if (
136284 expected_memory_partition_mode is not None
137- and partition_current .memory != expected_memory_partition_mode
138- ) or (
285+ and partition_current .partition_type != expected_memory_partition_mode
286+ ):
287+ bad_memory_partition_mode_gpus .append (
288+ {
289+ "gpu_id" : partition_current .gpu_id ,
290+ "memory_partition_mode" : partition_current .partition_type ,
291+ }
292+ )
293+
294+ for partition_current in partition_data .compute_partition :
295+ if (
139296 expected_compute_partition_mode is not None
140- and partition_current .accelerator_type != expected_compute_partition_mode
297+ and partition_current .partition_type != expected_compute_partition_mode
141298 ):
142299 bad_memory_partition_mode_gpus .append (
143300 {
144301 "gpu_id" : partition_current .gpu_id ,
145- "compute_partition_mode" : partition_current .accelerator_type ,
146- "memory_partition_mode" : partition_current .memory ,
302+ "compute_partition_mode" : partition_current .partition_type ,
147303 }
148304 )
305+
306+ # accelerator currently not available in API
307+
149308 if bad_memory_partition_mode_gpus :
150309 self ._log_event (
151310 category = EventCategory .PLATFORM ,
@@ -163,13 +322,52 @@ def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult:
163322 if args is None :
164323 args = AmdSmiAnalyzerArgs ()
165324
325+ if args .l0_to_recovery_count_error_threshold is None :
326+ args .l0_to_recovery_count_error_threshold = self .L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD
327+ if args .l0_to_recovery_count_warning_threshold is None :
328+ args .l0_to_recovery_count_warning_threshold = (
329+ self .L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD
330+ )
331+
166332 if args .expected_gpu_processes :
167333 self .expected_gpu_processes (data .process , args .expected_gpu_processes )
334+
335+ if data .static is None or len (data .static ) == 0 :
336+ self ._log_event (
337+ category = EventCategory .PLATFORM ,
338+ description = "No AMD SMI static data available" ,
339+ priority = EventPriority .WARNING ,
340+ data = {"amdsmi_static_data" : data .static },
341+ )
342+ else :
343+ if args .expected_max_power :
344+ self .check_expected_max_power (data .static , args .expected_max_power )
345+ if args .expected_driver_version :
346+ self .check_expected_driver_version (data .static , args .expected_driver_version )
168347 if args .expected_memory_partition_mode or args .expected_compute_partition_mode :
169348 self .check_expected_memory_partition_mode (
170349 data .partition ,
171350 args .expected_memory_partition_mode ,
172351 args .expected_compute_partition_mode ,
173352 )
353+ self .static_consistancy_check (data .static )
354+ if (
355+ self .system_info .sku
356+ and args .devid_ep
357+ and args .devid_ep_vf
358+ and args .vendorid_ep
359+ and args .check_static_data
360+ ) or args .check_static_data :
361+ self .check_static_data (
362+ data .static ,
363+ args .vendorid_ep ,
364+ args .vendorid_ep ,
365+ (args .devid_ep , args .devid_ep ),
366+ (args .devid_ep , args .devid_ep ),
367+ sku_name = args .sku_name ,
368+ )
369+
370+ if args .expected_pldm_version :
371+ self .check_pldm_version (data .firmware , args .expected_pldm_version )
174372
175373 return self .result
0 commit comments