2424#
2525###############################################################################
2626from collections import defaultdict
27- from typing import Any , Dict , List , Optional
27+ from typing import Any , Dict , List , Optional , Union
2828
2929from nodescraper .enums import EventCategory , EventPriority
3030from nodescraper .interfaces import DataAnalyzer
@@ -39,9 +39,6 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
3939
4040 DATA_MODEL = AmdSmiDataModel
4141
42- L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3
43- L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1
44-
4542 def check_expected_max_power (
4643 self ,
4744 amdsmi_static_data : list [AmdSmiStatic ],
@@ -53,7 +50,7 @@ def check_expected_max_power(
5350 amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
5451 expected_max_power (int): expected max power
5552 """
56- incorrect_max_power_gpus : dict [int , int | str | float ] = {}
53+ incorrect_max_power_gpus : dict [int , Union [ int , str , float ] ] = {}
5754 for gpu in amdsmi_static_data :
5855 if gpu .limit is None or gpu .limit .max_power is None :
5956 self ._log_event (
@@ -104,9 +101,9 @@ def check_expected_driver_version(
104101 """
105102 bad_driver_gpus : list [int ] = []
106103
107- versions_by_gpu : dict [int , str | None ] = {}
104+ versions_by_gpu : dict [int , Optional [ str ] ] = {}
108105 for gpu in amdsmi_static_data :
109- ver : str | None = None
106+ ver : Optional [ str ] = None
110107 if gpu .driver is not None :
111108 ver = gpu .driver .version
112109 versions_by_gpu [gpu .gpu ] = ver
@@ -126,12 +123,12 @@ def check_expected_driver_version(
126123 )
127124
128125 def expected_gpu_processes (
129- self , processes_data : list [Processes ] | None , max_num_processes : int
126+ self , processes_data : Optional [ list [Processes ]] , max_num_processes : int
130127 ):
131128 """Check the number of GPU processes running
132129
133130 Args:
134- processes_data (list[Processes] | None ): list of processes per GPU
131+ processes_data (Optional[ list[Processes]] ): list of processes per GPU
135132 max_num_processes (int): max number of expected processes
136133 """
137134 gpu_exceeds_num_processes : dict [int , int ] = {}
@@ -172,7 +169,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
172169 Args:
173170 amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
174171 """
175- consistancy_data : dict [str , set [str ] | set [int ]] = {
172+ consistancy_data : dict [str , Union [ set [str ], set [int ] ]] = {
176173 "market_name" : {gpu .asic .market_name for gpu in amdsmi_static_data },
177174 "vendor_id" : {gpu .asic .vendor_id for gpu in amdsmi_static_data },
178175 "vendor_name" : {gpu .asic .vendor_name for gpu in amdsmi_static_data },
@@ -190,7 +187,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
190187 self ._log_event (
191188 category = EventCategory .PLATFORM ,
192189 description = f"{ key } is not consistent across all GPUs" ,
193- priority = EventPriority .ERROR ,
190+ priority = EventPriority .WARNING ,
194191 data = {
195192 "field" : key ,
196193 "non_consistent_values" : value ,
@@ -200,26 +197,26 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
200197 def check_static_data (
201198 self ,
202199 amdsmi_static_data : list [AmdSmiStatic ],
203- vendor_id : str | None ,
204- subvendor_id : str | None ,
205- device_id : tuple [str | None , str | None ],
206- subsystem_id : tuple [str | None , str | None ],
207- sku_name : str | None ,
200+ vendor_id : Optional [ str ] ,
201+ subvendor_id : Optional [ str ] ,
202+ device_id : tuple [Optional [ str ], Optional [ str ] ],
203+ subsystem_id : tuple [Optional [ str ], Optional [ str ] ],
204+ sku_name : Optional [ str ] ,
208205 ) -> None :
209206 """Check expected static data
210207
211208 Args:
212209 amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
213- vendor_id (str | None ): expected vendor_id
214- subvendor_id (str | None ): expected subvendor_id
215- device_id (tuple[str | None, str | None ]): expected device_id
216- subsystem_id (tuple[str | None, str | None ]): expected subsystem_id
217- sku_name (str | None ): expected sku_name
210+ vendor_id (Optional[ str] ): expected vendor_id
211+ subvendor_id (Optional[ str] ): expected subvendor_id
212+ device_id (tuple[Optional[ str], Optional[ str] ]): expected device_id
213+ subsystem_id (tuple[Optional[ str], Optional[ str] ]): expected subsystem_id
214+ sku_name (Optional[ str] ): expected sku_name
218215 """
219216
220217 mismatches : list [tuple [int , str , str , str ]] = []
221218
222- expected_data : dict [str , str | None ] = {
219+ expected_data : Dict [str , Optional [ str ] ] = {
223220 "vendor_id" : vendor_id ,
224221 "subvendor_id" : subvendor_id ,
225222 "vendor_name" : "Advanced Micro Devices Inc" ,
@@ -311,14 +308,14 @@ def _format_static_mismatch_payload(
311308
312309 def check_pldm_version (
313310 self ,
314- amdsmi_fw_data : list [Fw ] | None ,
315- expected_pldm_version : str | None ,
311+ amdsmi_fw_data : Optional [ list [Fw ]] ,
312+ expected_pldm_version : Optional [ str ] ,
316313 ):
317314 """Check expected pldm version
318315
319316 Args:
320- amdsmi_fw_data (list[Fw] | None ): data model
321- expected_pldm_version (str | None ): expected pldm version
317+ amdsmi_fw_data (Optional[ list[Fw]] ): data model
318+ expected_pldm_version (Optional[ str] ): expected pldm version
322319 """
323320 PLDM_STRING = "PLDM_BUNDLE"
324321 if amdsmi_fw_data is None or len (amdsmi_fw_data ) == 0 :
@@ -355,16 +352,16 @@ def check_pldm_version(
355352
356353 def check_expected_memory_partition_mode (
357354 self ,
358- partition_data : Partition | None ,
359- expected_memory_partition_mode : str | None ,
360- expected_compute_partition_mode : str | None ,
355+ partition_data : Optional [ Partition ] ,
356+ expected_memory_partition_mode : Optional [ str ] ,
357+ expected_compute_partition_mode : Optional [ str ] ,
361358 ):
362359 """Check expected mem partition mode
363360
364361 Args:
365- partition_data (Partition | None ): data model
366- expected_memory_partition_mode (str | None ): expected mem partition mode
367- expected_compute_partition_mode (str | None ): expected compute partition mode
362+ partition_data (Optional[ Partition] ): data model
363+ expected_memory_partition_mode (Optional[ str] ): expected mem partition mode
364+ expected_compute_partition_mode (Optional[ str] ): expected compute partition mode
368365 """
369366 if partition_data is None :
370367 self ._log_event (
@@ -429,13 +426,6 @@ def analyze_data(
429426 if args is None :
430427 args = AmdSmiAnalyzerArgs ()
431428
432- if args .l0_to_recovery_count_error_threshold is None :
433- args .l0_to_recovery_count_error_threshold = self .L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD
434- if args .l0_to_recovery_count_warning_threshold is None :
435- args .l0_to_recovery_count_warning_threshold = (
436- self .L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD
437- )
438-
439429 if args .expected_gpu_processes :
440430 self .expected_gpu_processes (data .process , args .expected_gpu_processes )
441431
0 commit comments