Skip to content

Commit 0eec9f7

Browse files
committed
addressed reviews
1 parent e13889b commit 0eec9f7

File tree

5 files changed

+186
-191
lines changed

5 files changed

+186
-191
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#
2525
###############################################################################
2626
from collections import defaultdict
27-
from typing import Any, Dict, List, Optional
27+
from typing import Any, Dict, List, Optional, Union
2828

2929
from nodescraper.enums import EventCategory, EventPriority
3030
from nodescraper.interfaces import DataAnalyzer
@@ -39,9 +39,6 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
3939

4040
DATA_MODEL = AmdSmiDataModel
4141

42-
L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3
43-
L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1
44-
4542
def check_expected_max_power(
4643
self,
4744
amdsmi_static_data: list[AmdSmiStatic],
@@ -53,7 +50,7 @@ def check_expected_max_power(
5350
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
5451
expected_max_power (int): expected max power
5552
"""
56-
incorrect_max_power_gpus: dict[int, int | str | float] = {}
53+
incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {}
5754
for gpu in amdsmi_static_data:
5855
if gpu.limit is None or gpu.limit.max_power is None:
5956
self._log_event(
@@ -104,9 +101,9 @@ def check_expected_driver_version(
104101
"""
105102
bad_driver_gpus: list[int] = []
106103

107-
versions_by_gpu: dict[int, str | None] = {}
104+
versions_by_gpu: dict[int, Optional[str]] = {}
108105
for gpu in amdsmi_static_data:
109-
ver: str | None = None
106+
ver: Optional[str] = None
110107
if gpu.driver is not None:
111108
ver = gpu.driver.version
112109
versions_by_gpu[gpu.gpu] = ver
@@ -126,12 +123,12 @@ def check_expected_driver_version(
126123
)
127124

128125
def expected_gpu_processes(
129-
self, processes_data: list[Processes] | None, max_num_processes: int
126+
self, processes_data: Optional[list[Processes]], max_num_processes: int
130127
):
131128
"""Check the number of GPU processes running
132129
133130
Args:
134-
processes_data (list[Processes] | None): list of processes per GPU
131+
processes_data (Optional[list[Processes]]): list of processes per GPU
135132
max_num_processes (int): max number of expected processes
136133
"""
137134
gpu_exceeds_num_processes: dict[int, int] = {}
@@ -172,7 +169,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
172169
Args:
173170
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
174171
"""
175-
consistancy_data: dict[str, set[str] | set[int]] = {
172+
consistancy_data: dict[str, Union[set[str], set[int]]] = {
176173
"market_name": {gpu.asic.market_name for gpu in amdsmi_static_data},
177174
"vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data},
178175
"vendor_name": {gpu.asic.vendor_name for gpu in amdsmi_static_data},
@@ -190,7 +187,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
190187
self._log_event(
191188
category=EventCategory.PLATFORM,
192189
description=f"{key} is not consistent across all GPUs",
193-
priority=EventPriority.ERROR,
190+
priority=EventPriority.WARNING,
194191
data={
195192
"field": key,
196193
"non_consistent_values": value,
@@ -200,26 +197,26 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
200197
def check_static_data(
201198
self,
202199
amdsmi_static_data: list[AmdSmiStatic],
203-
vendor_id: str | None,
204-
subvendor_id: str | None,
205-
device_id: tuple[str | None, str | None],
206-
subsystem_id: tuple[str | None, str | None],
207-
sku_name: str | None,
200+
vendor_id: Optional[str],
201+
subvendor_id: Optional[str],
202+
device_id: tuple[Optional[str], Optional[str]],
203+
subsystem_id: tuple[Optional[str], Optional[str]],
204+
sku_name: Optional[str],
208205
) -> None:
209206
"""Check expected static data
210207
211208
Args:
212209
amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
213-
vendor_id (str | None): expected vendor_id
214-
subvendor_id (str | None): expected subvendor_id
215-
device_id (tuple[str | None, str | None]): expected device_id
216-
subsystem_id (tuple[str | None, str | None]): expected subsystem_id
217-
sku_name (str | None): expected sku_name
210+
vendor_id (Optional[str]): expected vendor_id
211+
subvendor_id (Optional[str]): expected subvendor_id
212+
device_id (tuple[Optional[str], Optional[str]]): expected device_id
213+
subsystem_id (tuple[Optional[str], Optional[str]]): expected subsystem_id
214+
sku_name (Optional[str]): expected sku_name
218215
"""
219216

220217
mismatches: list[tuple[int, str, str, str]] = []
221218

222-
expected_data: dict[str, str | None] = {
219+
expected_data: Dict[str, Optional[str]] = {
223220
"vendor_id": vendor_id,
224221
"subvendor_id": subvendor_id,
225222
"vendor_name": "Advanced Micro Devices Inc",
@@ -311,14 +308,14 @@ def _format_static_mismatch_payload(
311308

312309
def check_pldm_version(
313310
self,
314-
amdsmi_fw_data: list[Fw] | None,
315-
expected_pldm_version: str | None,
311+
amdsmi_fw_data: Optional[list[Fw]],
312+
expected_pldm_version: Optional[str],
316313
):
317314
"""Check expected pldm version
318315
319316
Args:
320-
amdsmi_fw_data (list[Fw] | None): data model
321-
expected_pldm_version (str | None): expected pldm version
317+
amdsmi_fw_data (Optional[list[Fw]]): data model
318+
expected_pldm_version (Optional[str]): expected pldm version
322319
"""
323320
PLDM_STRING = "PLDM_BUNDLE"
324321
if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0:
@@ -355,16 +352,16 @@ def check_pldm_version(
355352

356353
def check_expected_memory_partition_mode(
357354
self,
358-
partition_data: Partition | None,
359-
expected_memory_partition_mode: str | None,
360-
expected_compute_partition_mode: str | None,
355+
partition_data: Optional[Partition],
356+
expected_memory_partition_mode: Optional[str],
357+
expected_compute_partition_mode: Optional[str],
361358
):
362359
"""Check expected mem partition mode
363360
364361
Args:
365-
partition_data (Partition | None): data model
366-
expected_memory_partition_mode (str | None): expected mem partition mode
367-
expected_compute_partition_mode (str | None): expected compute partition mode
362+
partition_data (Optional[Partition]): data model
363+
expected_memory_partition_mode (Optional[str]): expected mem partition mode
364+
expected_compute_partition_mode (Optional[str]): expected compute partition mode
368365
"""
369366
if partition_data is None:
370367
self._log_event(
@@ -429,13 +426,6 @@ def analyze_data(
429426
if args is None:
430427
args = AmdSmiAnalyzerArgs()
431428

432-
if args.l0_to_recovery_count_error_threshold is None:
433-
args.l0_to_recovery_count_error_threshold = self.L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD
434-
if args.l0_to_recovery_count_warning_threshold is None:
435-
args.l0_to_recovery_count_warning_threshold = (
436-
self.L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD
437-
)
438-
439429
if args.expected_gpu_processes:
440430
self.expected_gpu_processes(data.process, args.expected_gpu_processes)
441431

0 commit comments

Comments
 (0)