Skip to content

Commit 6406994

Browse files
committed
docstring + mypy
1 parent 1e456c3 commit 6406994

File tree

3 files changed

+218
-29
lines changed

3 files changed

+218
-29
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 77 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#
2525
###############################################################################
2626
from collections import defaultdict
27-
from typing import Any, Dict, List
27+
from typing import Any, Dict, List, Optional
2828

2929
from nodescraper.enums import EventCategory, EventPriority
3030
from nodescraper.interfaces import DataAnalyzer
@@ -47,7 +47,12 @@ def check_expected_max_power(
4747
amdsmi_static_data: list[AmdSmiStatic],
4848
expected_max_power: int,
4949
):
50-
"""Check the max power for all GPUs. If the max power is not as expected, log an error event"""
50+
"""Check against expected max power
51+
52+
Args:
53+
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
54+
expected_max_power (int): expected max power
55+
"""
5156
incorrect_max_power_gpus: dict[int, int | str | float] = {}
5257
for gpu in amdsmi_static_data:
5358
if gpu.limit is None or gpu.limit.max_power is None:
@@ -91,6 +96,12 @@ def check_expected_driver_version(
9196
amdsmi_static_data: list[AmdSmiStatic],
9297
expected_driver_version: str,
9398
) -> None:
99+
"""Check expected driver version
100+
101+
Args:
102+
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
103+
expected_driver_version (str): expected driver version
104+
"""
94105
bad_driver_gpus: list[int] = []
95106

96107
versions_by_gpu: dict[int, str | None] = {}
@@ -117,8 +128,12 @@ def check_expected_driver_version(
117128
def expected_gpu_processes(
118129
self, processes_data: list[Processes] | None, max_num_processes: int
119130
):
120-
"""Check the number of GPU processes running. If the number of processes is greater than the expected
121-
number of processes, log an error event"""
131+
"""Check the number of GPU processes running
132+
133+
Args:
134+
processes_data (list[Processes] | None): list of processes per GPU
135+
max_num_processes (int): max number of expected processes
136+
"""
122137
gpu_exceeds_num_processes: dict[int, int] = {}
123138
if processes_data is None or len(processes_data) == 0:
124139
self._log_event(
@@ -133,7 +148,7 @@ def expected_gpu_processes(
133148
if len(process.process_list) == 0 or isinstance(
134149
process.process_list[0].process_info, str
135150
):
136-
# Skip if there are no processes or the process info is a string which indicates no processes
151+
# Skip if there are no processes, or if process_info is a string (which indicates no processes)
137152
continue
138153

139154
process_count = len(process.process_list)
@@ -152,7 +167,11 @@ def expected_gpu_processes(
152167
)
153168

154169
def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
155-
"""Check the static data for all GPUs. If the static data is not consistent, log an error event"""
170+
"""Check that static data is consistent across all GPUs
171+
172+
Args:
173+
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
174+
"""
156175
consistancy_data: dict[str, set[str] | set[int]] = {
157176
"market_name": {gpu.asic.market_name for gpu in amdsmi_static_data},
158177
"vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data},
@@ -185,9 +204,21 @@ def check_static_data(
185204
subvendor_id: str | None,
186205
device_id: tuple[str | None, str | None],
187206
subsystem_id: tuple[str | None, str | None],
188-
sku_name: str,
207+
sku_name: str | None,
189208
) -> None:
209+
"""Check expected static data
210+
211+
Args:
212+
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
213+
vendor_id (str | None): expected vendor_id
214+
subvendor_id (str | None): expected subvendor_id
215+
device_id (tuple[str | None, str | None]): expected device_id
216+
subsystem_id (tuple[str | None, str | None]): expected subsystem_id
217+
sku_name (str | None): expected sku_name
218+
"""
219+
190220
mismatches: list[tuple[int, str, str, str]] = []
221+
191222
expected_data: dict[str, str | None] = {
192223
"vendor_id": vendor_id,
193224
"subvendor_id": subvendor_id,
@@ -200,7 +231,7 @@ def check_static_data(
200231
"vendor_id": gpu_data.asic.vendor_id,
201232
"subvendor_id": gpu_data.asic.subvendor_id,
202233
"vendor_name": gpu_data.asic.vendor_name,
203-
"market_name": sku_name,
234+
"market_name": gpu_data.asic.market_name,
204235
}
205236

206237
for key, expected in expected_data.items():
@@ -249,7 +280,14 @@ def _format_static_mismatch_payload(
249280
self,
250281
mismatches: List[tuple[int, str, str, str]],
251282
) -> Dict[str, Any]:
252-
""" """
283+
"""Helper function for pretty printing mismatch in expected data
284+
285+
Args:
286+
mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU
287+
288+
Returns:
289+
Dict[str, Any]: dict of mismatched data per GPU
290+
"""
253291
per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list)
254292
field_set: set[str] = set()
255293

@@ -276,7 +314,12 @@ def check_pldm_version(
276314
amdsmi_fw_data: list[Fw] | None,
277315
expected_pldm_version: str | None,
278316
):
279-
"""Check the PLDM version for all GPUs. If the PLDM version is not as expected, log an error event for which GPUs don't have a match"""
317+
"""Check expected pldm version
318+
319+
Args:
320+
amdsmi_fw_data (list[Fw] | None): data model
321+
expected_pldm_version (str | None): expected pldm version
322+
"""
280323
PLDM_STRING = "PLDM_BUNDLE"
281324
if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0:
282325
self._log_event(
@@ -316,6 +359,13 @@ def check_expected_memory_partition_mode(
316359
expected_memory_partition_mode: str | None,
317360
expected_compute_partition_mode: str | None,
318361
):
362+
"""Check expected memory and compute partition modes
363+
364+
Args:
365+
partition_data (Partition | None): data model
366+
expected_memory_partition_mode (str | None): expected mem partition mode
367+
expected_compute_partition_mode (str | None): expected compute partition mode
368+
"""
319369
if partition_data is None:
320370
self._log_event(
321371
category=EventCategory.PLATFORM,
@@ -336,15 +386,15 @@ def check_expected_memory_partition_mode(
336386
}
337387
)
338388

339-
for partition_current in partition_data.compute_partition:
389+
for compute_current in partition_data.compute_partition:
340390
if (
341391
expected_compute_partition_mode is not None
342-
and partition_current.partition_type != expected_compute_partition_mode
392+
and compute_current.partition_type != expected_compute_partition_mode
343393
):
344394
bad_memory_partition_mode_gpus.append(
345395
{
346-
"gpu_id": partition_current.gpu_id,
347-
"compute_partition_mode": partition_current.partition_type,
396+
"gpu_id": compute_current.gpu_id,
397+
"compute_partition_mode": compute_current.partition_type,
348398
}
349399
)
350400

@@ -362,7 +412,19 @@ def check_expected_memory_partition_mode(
362412
},
363413
)
364414

365-
def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult:
415+
def analyze_data(
416+
self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
417+
) -> TaskResult:
418+
"""Analyze the amdsmi data against expected data
419+
420+
Args:
421+
data (AmdSmiDataModel): the AmdSmi data model
422+
args (Optional[AmdSmiAnalyzerArgs], optional): optional AmdSmi analyzer args. Defaults to None.
423+
424+
Returns:
425+
TaskResult: the result of the analysis indicating whether the AmdSmi data model
426+
matched the expected data
427+
"""
366428

367429
if args is None:
368430
args = AmdSmiAnalyzerArgs()

0 commit comments

Comments
 (0)