Skip to content

Commit 08ed3f0

Browse files
committed
filled in data for AmdSmiStatic, clock is left
1 parent 7094979 commit 08ed3f0

File tree

3 files changed

+190
-8
lines changed

3 files changed

+190
-8
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
3838

3939
DATA_MODEL = AmdSmiDataModel
4040

41-
L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 # Thresholds defined in https://ontrack-internal.amd.com/browse/DCGPUSDV-1204, must be greated than this value to generate a error event
42-
L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 # Thresholds defined in https://ontrack-internal.amd.com/browse/SWLORC-10120, Must be greater than this value to generate a warning event
41+
L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3
42+
L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1
4343

4444
def expected_gpu_processes(
4545
self, processes_data: list[Processes] | None, max_num_processes: int
@@ -65,7 +65,6 @@ def expected_gpu_processes(
6565

6666
process_count = len(process.process_list) # Number of processes for GPU
6767
if process_count > max_num_processes:
68-
# Log an error event if the number of processes is greater than the expected number log event
6968
gpu_exceeds_num_processes[process.gpu] = process_count
7069

7170
if gpu_exceeds_num_processes:

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 187 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,14 @@
4848
StaticAsic,
4949
StaticBoard,
5050
StaticBus,
51+
StaticCacheInfoItem,
52+
StaticDriver,
5153
StaticNuma,
54+
StaticPolicy,
55+
StaticSocPstate,
5256
StaticVbios,
5357
StaticVram,
58+
StaticXgmiPlpd,
5459
ValueUnit,
5560
)
5661
from nodescraper.utils import get_exception_details, get_exception_traceback
@@ -537,6 +542,16 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
537542
manufacturer_name=str(board.get("manufacturer_name", "")),
538543
)
539544

545+
# Driver
546+
driver_model = None
547+
drv_fn = getattr(self._amdsmi, "amdsmi_get_gpu_driver_info", None)
548+
if callable(drv_fn):
549+
drv = self._smi_try(drv_fn, h, default={}) or {}
550+
driver_model = StaticDriver(
551+
name=_nz(drv.get("driver_name"), default="unknown"),
552+
version=_nz(drv.get("driver_version"), default="unknown"),
553+
)
554+
540555
# VBIOS
541556
vb = {
542557
k: board[k]
@@ -590,6 +605,10 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
590605
max_bandwidth=None,
591606
)
592607

608+
soc_pstate_model = self._get_soc_pstate(h)
609+
xgmi_plpd_model = self._get_xgmi_plpd(h)
610+
cache_info_model = self._get_cache_info(h)
611+
593612
try:
594613
out.append(
595614
AmdSmiStatic(
@@ -598,15 +617,16 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
598617
bus=bus,
599618
vbios=vbios_model,
600619
limit=None, # not available via API
620+
driver=driver_model,
601621
board=board_model,
602-
soc_pstate=None, # TODO
603-
xgmi_plpd=None, # TODO
622+
soc_pstate=soc_pstate_model,
623+
xgmi_plpd=xgmi_plpd_model,
604624
process_isolation="",
605625
numa=numa_model,
606626
vram=vram_model,
607-
cache_info=[], # TODO
627+
cache_info=cache_info_model,
608628
partition=None,
609-
clock=None, # TODO
629+
clock=None, # TODO amdsmi_get_clk_freq??
610630
)
611631
)
612632
except ValidationError as e:
@@ -619,6 +639,169 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
619639

620640
return out
621641

642+
def _get_soc_pstate(self, h) -> StaticSocPstate | None:
    """Build the SoC P-state policy model for one GPU handle.

    Looks up ``amdsmi_get_soc_pstate`` on ``self._amdsmi`` and invokes it
    through ``self._smi_try`` with ``default=None``. The payload is read as
    a dict with the keys ``num_supported``, ``current_id`` and ``policies``
    (a list of dicts carrying ``policy_id`` and ``policy_description``).

    Args:
        h: GPU processor handle forwarded unchanged to the AMD SMI call.

    Returns:
        A ``StaticSocPstate``, or ``None`` when the library does not expose
        the call, the payload is not a dict, every field is empty/zero, or
        the model fails validation.
    """
    # Same defensive lookup used for the driver info call: a library build
    # without this function must not raise AttributeError here.
    fetch = getattr(self._amdsmi, "amdsmi_get_soc_pstate", None)
    if not callable(fetch):
        return None

    data = self._smi_try(fetch, h, default=None)
    if not isinstance(data, dict):
        return None

    def _to_int(value: object) -> int:
        """Coerce *value* to int; missing or unparsable values become 0."""
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    num_supported = _to_int(data.get("num_supported", 0))
    current_id = _to_int(data.get("current_id", 0))

    policies: list[StaticPolicy] = []
    policies_raw = data.get("policies") or []
    if isinstance(policies_raw, list):
        for entry in policies_raw:
            if not isinstance(entry, dict):
                continue
            pid = entry.get("policy_id", 0)
            desc = entry.get("policy_description", "")
            try:
                policies.append(
                    StaticPolicy(
                        policy_id=int(pid) if pid not in (None, "") else 0,
                        policy_description=str(desc),
                    )
                )
            except (ValidationError, TypeError, ValueError, OverflowError):
                # int() failures are not ValidationError; skip the malformed
                # policy instead of aborting the whole static collection.
                continue

    # Nothing meaningful reported: treat as "not available".
    if not num_supported and not current_id and not policies:
        return None

    try:
        return StaticSocPstate(
            num_supported=num_supported,
            current_id=current_id,
            policies=policies,
        )
    except ValidationError:
        return None
685+
686+
def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None:
    """Build the XGMI PLPD policy model for one GPU handle.

    Looks up ``amdsmi_get_xgmi_plpd`` on ``self._amdsmi`` and invokes it
    through ``self._smi_try`` with ``default=None``. The payload is read as
    a dict with the keys ``num_supported``, ``current_id`` and ``plpds``
    (a list of dicts carrying ``policy_id`` and ``policy_description``).

    Args:
        h: GPU processor handle forwarded unchanged to the AMD SMI call.

    Returns:
        A ``StaticXgmiPlpd``, or ``None`` when the library does not expose
        the call, the payload is not a dict, every field is empty/zero, or
        the model fails validation.
    """
    # Same defensive lookup used for the driver info call: a library build
    # without this function must not raise AttributeError here.
    fetch = getattr(self._amdsmi, "amdsmi_get_xgmi_plpd", None)
    if not callable(fetch):
        return None

    data = self._smi_try(fetch, h, default=None)
    if not isinstance(data, dict):
        return None

    def _to_int(value: object) -> int:
        """Coerce *value* to int; missing or unparsable values become 0."""
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    num_supported = _to_int(data.get("num_supported", 0))
    current_id = _to_int(data.get("current_id", 0))

    plpds: list[StaticPolicy] = []
    plpds_raw = data.get("plpds") or []
    if isinstance(plpds_raw, list):
        for entry in plpds_raw:
            if not isinstance(entry, dict):
                continue
            pid = entry.get("policy_id", 0)
            desc = entry.get("policy_description", "")
            try:
                plpds.append(
                    StaticPolicy(
                        policy_id=int(pid) if pid not in (None, "") else 0,
                        policy_description=str(desc),
                    )
                )
            except (ValidationError, TypeError, ValueError, OverflowError):
                # int() failures are not ValidationError; skip the malformed
                # policy instead of aborting the whole static collection.
                continue

    # Nothing meaningful reported: treat as "not available".
    if not num_supported and not current_id and not plpds:
        return None

    try:
        return StaticXgmiPlpd(
            num_supported=num_supported,
            current_id=current_id,
            plpds=plpds,
        )
    except ValidationError:
        return None
729+
730+
def _get_cache_info(self, h) -> list[StaticCacheInfoItem]:
    """Map amdsmi_get_gpu_cache_info -> list[StaticCacheInfoItem].

    Looks up ``amdsmi_get_gpu_cache_info`` on ``self._amdsmi`` and invokes
    it through ``self._smi_try`` with ``default=None``. Each dict entry is
    converted into one ``StaticCacheInfoItem``; entries that are not dicts
    are ignored, and entries that cannot be converted are logged as a
    warning event and skipped.

    Args:
        h: GPU processor handle forwarded unchanged to the AMD SMI call.

    Returns:
        The converted cache entries; an empty list when the library does
        not expose the call or the call yields ``None``.
    """
    # Same defensive lookup used for the driver info call: a library build
    # without this function must not raise AttributeError here.
    fetch = getattr(self._amdsmi, "amdsmi_get_gpu_cache_info", None)
    if not callable(fetch):
        return []

    raw = self._smi_try(fetch, h, default=None)
    if raw is None:
        return []

    # NOTE(review): the AMD SMI Python API appears to wrap the per-cache
    # dicts as {"cache": [...]} in some releases - confirm against the
    # installed version. Unwrap that shape so the wrapper dict itself is not
    # mistaken for a single (all-zero) cache entry.
    if isinstance(raw, dict) and isinstance(raw.get("cache"), list):
        items = raw["cache"]
    elif isinstance(raw, list):
        items = raw
    else:
        items = [raw]

    def _to_num(v) -> float | int | None:
        """Return *v* as a number (int preferred for strings), else None."""
        if isinstance(v, (int, float)):
            return v
        if isinstance(v, str):
            text = v.strip()
            for convert in (int, float):
                try:
                    return convert(text)
                except ValueError:
                    continue
        return None

    def _vu_req(v) -> ValueUnit:
        """Wrap *v* as a unit-less ValueUnit, substituting 0 when missing."""
        n = _to_num(v)
        return ValueUnit(value=0 if n is None else n, unit="")

    def _vu_opt(v) -> ValueUnit | None:
        """Wrap *v* as a unit-less ValueUnit, or None when not numeric."""
        n = _to_num(v)
        return None if n is None else ValueUnit(value=n, unit="")

    def _as_list_str(v) -> list[str]:
        """Normalize a list, or a ','/';'-separated string, to list[str]."""
        if isinstance(v, list):
            return [str(x) for x in v]
        if isinstance(v, str):
            parts = [p.strip() for p in v.replace(";", ",").split(",")]
            return [p for p in parts if p]
        return []

    out: list[StaticCacheInfoItem] = []
    for e in items:
        if not isinstance(e, dict):
            continue

        # The whole per-entry conversion lives inside the try so that one
        # malformed entry (e.g. a non-finite cache_level, for which int()
        # raises) is skipped and logged instead of aborting the collection.
        try:
            cache_level = _vu_req(e.get("cache_level"))
            max_num_cu_shared = _vu_req(e.get("max_num_cu_shared"))
            num_cache_instance = _vu_req(e.get("num_cache_instance"))
            cache_size = _vu_opt(e.get("cache_size"))
            cache_props = _as_list_str(e.get("cache_properties"))

            # No cache name is read from the payload, so a label is
            # synthesized from the cache level.
            # NOTE(review): "Lable_" looks like a typo for "Label_"; it is
            # kept byte-identical because it is emitted data - confirm with
            # downstream consumers before renaming.
            level = cache_level.value
            label_suffix = int(level) if isinstance(level, (int, float)) else level
            cache_label = ValueUnit(value=f"Lable_{label_suffix}", unit="")

            out.append(
                StaticCacheInfoItem(
                    cache=cache_label,
                    cache_properties=cache_props,
                    cache_size=cache_size,
                    cache_level=cache_level,
                    max_num_cu_shared=max_num_cu_shared,
                    num_cache_instance=num_cache_instance,
                )
            )
        except (ValidationError, ValueError, OverflowError) as ve:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Bad cache info entry from AMDSMI; skipping",
                data={"entry": repr(e), "exception": get_exception_traceback(ve)},
                priority=EventPriority.WARNING,
            )
            continue

    return out
804+
622805
def collect_data(
623806
self,
624807
args=None,

nodescraper/plugins/inband/amdsmi/amdsmidata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ class AmdSmiStatic(BaseModel):
472472
bus: StaticBus
473473
vbios: StaticVbios | None
474474
limit: StaticLimit | None
475-
# driver: StaticDriver
475+
driver: StaticDriver | None
476476
board: StaticBoard
477477
# ras: StaticRas
478478
soc_pstate: StaticSocPstate | None

0 commit comments

Comments
 (0)