2424#
2525###############################################################################
2626from collections import defaultdict
27- from typing import Any , Dict , List
27+ from typing import Any , Dict , List , Optional
2828
2929from nodescraper .enums import EventCategory , EventPriority
3030from nodescraper .interfaces import DataAnalyzer
@@ -47,7 +47,12 @@ def check_expected_max_power(
4747 amdsmi_static_data : list [AmdSmiStatic ],
4848 expected_max_power : int ,
4949 ):
50- """Check the max power for all GPUs. If the max power is not as expected, log an error event"""
50+ """Check against expected max power
51+
52+ Args:
53+ amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
54+ expected_max_power (int): expected max power
55+ """
5156 incorrect_max_power_gpus : dict [int , int | str | float ] = {}
5257 for gpu in amdsmi_static_data :
5358 if gpu .limit is None or gpu .limit .max_power is None :
@@ -91,6 +96,12 @@ def check_expected_driver_version(
9196 amdsmi_static_data : list [AmdSmiStatic ],
9297 expected_driver_version : str ,
9398 ) -> None :
99+ """Check expected driver version
100+
101+ Args:
102+ amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
103+ expected_driver_version (str): expected driver version
104+ """
94105 bad_driver_gpus : list [int ] = []
95106
96107 versions_by_gpu : dict [int , str | None ] = {}
@@ -117,8 +128,12 @@ def check_expected_driver_version(
117128 def expected_gpu_processes (
118129 self , processes_data : list [Processes ] | None , max_num_processes : int
119130 ):
120- """Check the number of GPU processes running. If the number of processes is greater than the expected
121- number of processes, log an error event"""
131+ """Check the number of GPU processes running
132+
133+ Args:
134+ processes_data (list[Processes] | None): list of processes per GPU
135+ max_num_processes (int): max number of expected processes
136+ """
122137 gpu_exceeds_num_processes : dict [int , int ] = {}
123138 if processes_data is None or len (processes_data ) == 0 :
124139 self ._log_event (
@@ -133,7 +148,7 @@ def expected_gpu_processes(
133148 if len (process .process_list ) == 0 or isinstance (
134149 process .process_list [0 ].process_info , str
135150 ):
136- # Skip if there are no processes or the process info is a string which indicates no processes
151+ # Skip if there are no processes
137152 continue
138153
139154 process_count = len (process .process_list )
@@ -152,7 +167,11 @@ def expected_gpu_processes(
152167 )
153168
154169 def static_consistancy_check (self , amdsmi_static_data : list [AmdSmiStatic ]):
155- """Check the static data for all GPUs. If the static data is not consistent, log an error event"""
170+ """Check consistency of expected data
171+
172+ Args:
173+ amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
174+ """
156175 consistancy_data : dict [str , set [str ] | set [int ]] = {
157176 "market_name" : {gpu .asic .market_name for gpu in amdsmi_static_data },
158177 "vendor_id" : {gpu .asic .vendor_id for gpu in amdsmi_static_data },
@@ -185,9 +204,21 @@ def check_static_data(
185204 subvendor_id : str | None ,
186205 device_id : tuple [str | None , str | None ],
187206 subsystem_id : tuple [str | None , str | None ],
188- sku_name : str ,
207+ sku_name : str | None ,
189208 ) -> None :
209+ """Check expected static data
210+
211+ Args:
212+ amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
213+ vendor_id (str | None): expected vendor_id
214+ subvendor_id (str | None): expected subvendor_id
215+ device_id (tuple[str | None, str | None]): expected device_id
216+ subsystem_id (tuple[str | None, str | None]): expected subsystem_id
217+ sku_name (str | None): expected sku_name
218+ """
219+
190220 mismatches : list [tuple [int , str , str , str ]] = []
221+
191222 expected_data : dict [str , str | None ] = {
192223 "vendor_id" : vendor_id ,
193224 "subvendor_id" : subvendor_id ,
@@ -200,7 +231,7 @@ def check_static_data(
200231 "vendor_id" : gpu_data .asic .vendor_id ,
201232 "subvendor_id" : gpu_data .asic .subvendor_id ,
202233 "vendor_name" : gpu_data .asic .vendor_name ,
203- "market_name" : sku_name ,
234+ "market_name" : gpu_data . asic . market_name ,
204235 }
205236
206237 for key , expected in expected_data .items ():
@@ -249,7 +280,14 @@ def _format_static_mismatch_payload(
249280 self ,
250281 mismatches : List [tuple [int , str , str , str ]],
251282 ) -> Dict [str , Any ]:
252- """ """
283+ """Helper function for pretty printing mismatch in expected data
284+
285+ Args:
286+ mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU
287+
288+ Returns:
289+ Dict[str, Any]: dict of mismatched data per GPU
290+ """
253291 per_gpu : Dict [int , List [Dict [str , str ]]] = defaultdict (list )
254292 field_set : set [str ] = set ()
255293
@@ -276,7 +314,12 @@ def check_pldm_version(
276314 amdsmi_fw_data : list [Fw ] | None ,
277315 expected_pldm_version : str | None ,
278316 ):
279- """Check the PLDM version for all GPUs. If the PLDM version is not as expected, log an error event for which GPUs don't have a match"""
317+ """Check expected pldm version
318+
319+ Args:
320+ amdsmi_fw_data (list[Fw] | None): data model
321+ expected_pldm_version (str | None): expected pldm version
322+ """
280323 PLDM_STRING = "PLDM_BUNDLE"
281324 if amdsmi_fw_data is None or len (amdsmi_fw_data ) == 0 :
282325 self ._log_event (
@@ -316,6 +359,13 @@ def check_expected_memory_partition_mode(
316359 expected_memory_partition_mode : str | None ,
317360 expected_compute_partition_mode : str | None ,
318361 ):
362+ """Check expected mem partition mode
363+
364+ Args:
365+ partition_data (Partition | None): data model
366+ expected_memory_partition_mode (str | None): expected mem partition mode
367+ expected_compute_partition_mode (str | None): expected compute partition mode
368+ """
319369 if partition_data is None :
320370 self ._log_event (
321371 category = EventCategory .PLATFORM ,
@@ -336,15 +386,15 @@ def check_expected_memory_partition_mode(
336386 }
337387 )
338388
339- for partition_current in partition_data .compute_partition :
389+ for compute_current in partition_data .compute_partition :
340390 if (
341391 expected_compute_partition_mode is not None
342- and partition_current .partition_type != expected_compute_partition_mode
392+ and compute_current .partition_type != expected_compute_partition_mode
343393 ):
344394 bad_memory_partition_mode_gpus .append (
345395 {
346- "gpu_id" : partition_current .gpu_id ,
347- "compute_partition_mode" : partition_current .partition_type ,
396+ "gpu_id" : compute_current .gpu_id ,
397+ "compute_partition_mode" : compute_current .partition_type ,
348398 }
349399 )
350400
@@ -362,7 +412,19 @@ def check_expected_memory_partition_mode(
362412 },
363413 )
364414
365- def analyze_data (self , data : AmdSmiDataModel , args = None ) -> TaskResult :
415+ def analyze_data (
416+ self , data : AmdSmiDataModel , args : Optional [AmdSmiAnalyzerArgs ] = None
417+ ) -> TaskResult :
418+ """Analyze the amdsmi data against expected data
419+
420+ Args:
421+ data (AmdSmiDataModel): the AmdSmi data model
422+ args (Optional[AmdSmiAnalyzerArgs], optional): optional AmdSmi analyzer args. Defaults to None.
423+
424+ Returns:
425+ TaskResult: the result of the analysis indicating whether the AmdSmi data model
426+ matched the expected data
427+ """
366428
367429 if args is None :
368430 args = AmdSmiAnalyzerArgs ()
0 commit comments