Skip to content

Commit c3be354

Browse files
committed
updated partition, and other calls that look slightly different
1 parent a8437e4 commit c3be354

File tree

2 files changed

+146
-128
lines changed

2 files changed

+146
-128
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 139 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@
2424
#
2525
###############################################################################
2626
import importlib
27-
from typing import cast
2827

2928
from pydantic import ValidationError
3029

3130
from nodescraper.base.inbandcollectortask import InBandDataCollector
32-
from nodescraper.connection.inband.inband import CommandArtifact
3331
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
3432
from nodescraper.models import TaskResult
3533
from nodescraper.plugins.inband.amdsmi.amdsmidata import (
@@ -66,8 +64,6 @@
6664
class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]):
6765
"""class for collection of inband tool amd-smi data."""
6866

69-
AMD_SMI_EXE = "amd-smi"
70-
7167
SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}
7268

7369
DATA_MODEL = AmdSmiDataModel
@@ -113,7 +109,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None:
113109
try:
114110
version = self._get_amdsmi_version()
115111
processes = self.get_process()
116-
partition = self._get_partition()
112+
partition = self.get_partition()
117113
firmware = self.get_firmware()
118114
gpu_list = self.get_gpu_list()
119115
statics = self.get_static()
@@ -168,26 +164,6 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None:
168164
rocm_version=rocm_ver,
169165
)
170166

171-
def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None:
172-
"""Run amd-smi command"""
173-
cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo)
174-
175-
if cmd_ret.exit_code != 0:
176-
self._log_event(
177-
category=EventCategory.APPLICATION,
178-
description="Error running amd-smi command",
179-
data={
180-
"command": cmd,
181-
"exit_code": cmd_ret.exit_code,
182-
"stderr": cmd_ret.stderr,
183-
},
184-
priority=EventPriority.ERROR,
185-
console_log=True,
186-
)
187-
return None
188-
189-
return cmd_ret.stdout or ""
190-
191167
def get_gpu_list(self) -> list[AmdSmiListItem] | None:
192168
devices = self._get_handles()
193169
out: list[AmdSmiListItem] = []
@@ -238,48 +214,68 @@ def _to_int(x, default=0):
238214
def get_process(self) -> list[Processes] | None:
239215
devices = self._get_handles()
240216
out: list[Processes] = []
217+
241218
for idx, h in enumerate(devices):
242219
try:
243-
pids = self._amdsmi.amdsmi_get_gpu_process_list(h) or []
220+
raw_list = (
221+
self._smi_try(self._amdsmi.amdsmi_get_gpu_process_list, h, default=[]) or []
222+
)
244223
plist: list[ProcessListItem] = []
245224

246-
for pid in pids:
247-
pinfo = self._smi_try(
248-
self._amdsmi.amdsmi_get_gpu_compute_process_info, h, pid, default=None
249-
)
250-
if not isinstance(pinfo, dict):
251-
plist.append(ProcessListItem(process_info=str(pid)))
225+
for entry in raw_list:
226+
if not isinstance(entry, dict):
227+
plist.append(ProcessListItem(process_info=str(entry)))
252228
continue
253229

254-
plist.append(
255-
ProcessListItem(
256-
process_info=cast(
257-
ProcessInfo,
258-
{
259-
"name": pinfo.get("name", str(pid)),
260-
"pid": int(pid),
261-
"memory_usage": {
262-
"gtt_mem": ValueUnit(
263-
value=pinfo.get("gtt_mem", 0), unit="B"
264-
),
265-
"cpu_mem": ValueUnit(
266-
value=pinfo.get("cpu_mem", 0), unit="B"
267-
),
268-
"vram_mem": ValueUnit(
269-
value=pinfo.get("vram_mem", 0), unit="B"
270-
),
271-
},
272-
"mem_usage": ValueUnit(
273-
value=pinfo.get("vram_mem", 0), unit="B"
274-
),
275-
"usage": {
276-
"gfx": ValueUnit(value=pinfo.get("gfx", 0), unit="%"),
277-
"enc": ValueUnit(value=pinfo.get("enc", 0), unit="%"),
278-
},
279-
},
230+
name = entry.get("name", "N/A")
231+
pid_val = entry.get("pid", 0)
232+
try:
233+
pid = int(pid_val) if pid_val not in (None, "") else 0
234+
except Exception:
235+
pid = 0
236+
237+
mem_vu = self._vu(entry.get("mem"), "B")
238+
mu = entry.get("memory_usage") or {}
239+
mem_usage = {
240+
"gtt_mem": self._vu(mu.get("gtt_mem"), "B"),
241+
"cpu_mem": self._vu(mu.get("cpu_mem"), "B"),
242+
"vram_mem": self._vu(mu.get("vram_mem"), "B"),
243+
}
244+
245+
eu = entry.get("engine_usage") or {}
246+
usage = {
247+
"gfx": self._vu(eu.get("gfx"), "ns"),
248+
"enc": self._vu(eu.get("enc"), "ns"),
249+
}
250+
251+
cu_occ = self._vu(entry.get("cu_occupancy"), "")
252+
253+
try:
254+
plist.append(
255+
ProcessListItem(
256+
process_info=ProcessInfo(
257+
name=str(name),
258+
pid=pid,
259+
mem=mem_vu,
260+
memory_usage=mem_usage,
261+
usage=usage,
262+
cu_occupancy=cu_occ,
263+
)
280264
)
281265
)
282-
)
266+
except ValidationError as e:
267+
self._log_event(
268+
category=EventCategory.APPLICATION,
269+
description="Failed to build ProcessListItem; skipping entry",
270+
data={
271+
"exception": get_exception_traceback(e),
272+
"gpu_index": idx,
273+
"entry": repr(entry),
274+
},
275+
priority=EventPriority.WARNING,
276+
)
277+
continue
278+
283279
try:
284280
out.append(Processes(gpu=idx, process_list=plist))
285281
except ValidationError as e:
@@ -296,36 +292,71 @@ def get_process(self) -> list[Processes] | None:
296292
data={"exception": get_exception_traceback(e), "gpu_index": idx},
297293
priority=EventPriority.WARNING,
298294
)
295+
299296
return out
300297

301-
def _get_partition(self) -> Partition | None:
298+
def get_partition(self) -> Partition | None:
302299
devices = self._get_handles()
303300
current: list[PartitionCurrent] = []
304301
memparts: list[PartitionMemory] = []
305-
resources: list[dict] = [] # keep as-is if your model allows
302+
resources: list[dict] = []
306303

307304
for idx, h in enumerate(devices):
305+
# compute
308306
c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {}
309-
m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
310307
c_dict = c if isinstance(c, dict) else {}
308+
309+
# memory
310+
m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
311311
m_dict = m if isinstance(m, dict) else {}
312312

313-
current.append(
314-
PartitionCurrent(
315-
gpu_id=idx,
316-
memory=c_dict.get("memory"),
317-
accelerator_type=c_dict.get("accelerator_type"),
318-
accelerator_profile_index=c_dict.get("accelerator_profile_index"),
319-
partition_id=c_dict.get("partition_id"),
313+
prof_list: list[dict] = (
314+
[]
315+
) # amdsmi_get_gpu_accelerator_partition_profile -> currently not supported
316+
317+
try:
318+
current.append(
319+
PartitionCurrent(
320+
gpu_id=idx,
321+
memory=c_dict.get("memory"),
322+
accelerator_type=c_dict.get("accelerator_type"),
323+
accelerator_profile_index=c_dict.get("accelerator_profile_index"),
324+
partition_id=c_dict.get("partition_id"),
325+
)
320326
)
321-
)
322-
memparts.append(
323-
PartitionMemory(
324-
gpu_id=idx,
325-
memory_partition_caps=m_dict.get("memory_partition_caps"),
326-
current_partition_id=m_dict.get("current_partition_id"),
327+
except ValidationError as e:
328+
self._log_event(
329+
category=EventCategory.APPLICATION,
330+
description="Failed to build PartitionCurrent",
331+
data={
332+
"exception": get_exception_traceback(e),
333+
"gpu_index": idx,
334+
"data": c_dict,
335+
},
336+
priority=EventPriority.WARNING,
327337
)
328-
)
338+
339+
try:
340+
memparts.append(
341+
PartitionMemory(
342+
gpu_id=idx,
343+
memory_partition_caps=m_dict.get("memory_partition_caps"),
344+
current_partition_id=m_dict.get("current_partition_id"),
345+
)
346+
)
347+
except ValidationError as e:
348+
self._log_event(
349+
category=EventCategory.APPLICATION,
350+
description="Failed to build PartitionMemory",
351+
data={
352+
"exception": get_exception_traceback(e),
353+
"gpu_index": idx,
354+
"data": m_dict,
355+
},
356+
priority=EventPriority.WARNING,
357+
)
358+
359+
resources.append({"gpu_id": idx, "profiles": []})
329360

330361
try:
331362
return Partition(
@@ -461,21 +492,6 @@ def _nz(val: object, default: str = "unknown") -> str:
461492
s = str(val).strip() if val is not None else ""
462493
return s if s and s.upper() != "N/A" else default
463494

464-
def _vu(val: object, unit: str) -> ValueUnit | None:
465-
"""Build ValueUnit from mixed numeric/string input, else None."""
466-
if val in (None, "", "N/A"):
467-
return None
468-
try:
469-
if isinstance(val, str):
470-
v = float(val) if any(ch in val for ch in ".eE") else int(val)
471-
elif isinstance(val, float):
472-
v = val
473-
else:
474-
v = int(val)
475-
except Exception:
476-
return None
477-
return ValueUnit(value=v, unit=unit)
478-
479495
pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None)
480496

481497
out: list[AmdSmiStatic] = []
@@ -496,8 +512,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
496512
pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version")
497513
bus = StaticBus(
498514
bdf=bdf,
499-
max_pcie_width=_vu(max_w, "x"),
500-
max_pcie_speed=_vu(max_s, "GT/s"),
515+
max_pcie_width=self._vu(max_w, "x"),
516+
max_pcie_speed=self._vu(max_s, "GT/s"),
501517
pcie_interface_version=_nz(pcie_ver),
502518
slot_type=_nz(p.get("slot_type")),
503519
)
@@ -602,8 +618,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
602618
vram_model = StaticVram(
603619
type=vram_type,
604620
vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor),
605-
size=_vu(vram_size_b, "B"),
606-
bit_width=_vu(vram_bits, "bit"),
621+
size=self._vu(vram_size_b, "B"),
622+
bit_width=self._vu(vram_bits, "bit"),
607623
max_bandwidth=None,
608624
)
609625

@@ -757,28 +773,6 @@ def _get_cache_info(self, h) -> list[StaticCacheInfoItem]:
757773

758774
items = raw if isinstance(raw, list) else [raw]
759775

760-
def _to_num(v) -> float | int | None:
761-
if isinstance(v, (int, float)):
762-
return v
763-
if isinstance(v, str):
764-
s = v.strip()
765-
try:
766-
return int(s)
767-
except Exception:
768-
try:
769-
return float(s)
770-
except Exception:
771-
return None
772-
return None
773-
774-
def _vu_req(v) -> ValueUnit:
775-
n = _to_num(v)
776-
return ValueUnit(value=0 if n is None else n, unit="")
777-
778-
def _vu_opt(v) -> ValueUnit | None:
779-
n = _to_num(v)
780-
return None if n is None else ValueUnit(value=n, unit="")
781-
782776
def _as_list_str(v) -> list[str]:
783777
if isinstance(v, list):
784778
return [str(x) for x in v]
@@ -792,10 +786,10 @@ def _as_list_str(v) -> list[str]:
792786
if not isinstance(e, dict):
793787
continue
794788

795-
cache_level = _vu_req(e.get("cache_level"))
796-
max_num_cu_shared = _vu_req(e.get("max_num_cu_shared"))
797-
num_cache_instance = _vu_req(e.get("num_cache_instance"))
798-
cache_size = _vu_opt(e.get("cache_size"))
789+
cache_level = self._vu(e.get("cache_level"), "", required=True)
790+
max_num_cu_shared = self._vu(e.get("max_num_cu_shared"), "", required=True)
791+
num_cache_instance = self._vu(e.get("num_cache_instance"), "", required=True)
792+
cache_size = self._vu(e.get("cache_size"), "", required=False)
799793
cache_props = _as_list_str(e.get("cache_properties"))
800794

801795
# AMDSMI doesn’t give a name , "Lable_<level>" as the label???
@@ -824,10 +818,8 @@ def _as_list_str(v) -> list[str]:
824818

825819
return out
826820

827-
828821
def _get_clock(self, h) -> StaticClockData | None:
829-
"""
830-
"""
822+
""" """
831823
fn = getattr(self._amdsmi, "amdsmi_get_clk_freq", None)
832824
clk_type = getattr(self._amdsmi, "AmdSmiClkType", None)
833825
if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"):
@@ -868,8 +860,6 @@ def _fmt(n: int | None) -> str | None:
868860
except ValidationError:
869861
return None
870862

871-
872-
873863
def collect_data(
874864
self,
875865
args=None,
@@ -902,3 +892,26 @@ def collect_data(
902892
self._amdsmi.amdsmi_shut_down()
903893
except Exception:
904894
pass
895+
896+
def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None:
897+
"""
898+
Build ValueUnit from mixed numeric/string input.
899+
Returns:
900+
None for None/''/'N/A' unless required=True, in which case ValueUnit(0, unit).
901+
"""
902+
if v in (None, "", "N/A"):
903+
return ValueUnit(value=0, unit=unit) if required else None
904+
try:
905+
if isinstance(v, str):
906+
s = v.strip()
907+
try:
908+
n = int(s)
909+
except Exception:
910+
n = float(s)
911+
elif isinstance(v, (int, float)):
912+
n = v
913+
else:
914+
n = int(v)
915+
except Exception:
916+
return ValueUnit(value=0, unit=unit) if required else None
917+
return ValueUnit(value=n, unit=unit)

0 commit comments

Comments
 (0)