Skip to content

Commit 7faf0f3

Browse files
committed
fixed partition(compute,gpu), static needs work
1 parent c3be354 commit 7faf0f3

File tree

2 files changed

+82
-98
lines changed

2 files changed

+82
-98
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 58 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
Fw,
3939
FwListItem,
4040
Partition,
41-
PartitionCurrent,
41+
PartitionCompute,
4242
PartitionMemory,
4343
Processes,
4444
ProcessInfo,
@@ -179,16 +179,9 @@ def _to_int(x, default=0):
179179
uuid = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or ""
180180
kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {}
181181

182+
# partition will be supported in amdsmi_get_gpu_accelerator_partition_profile.
183+
# Currently returns hardcoded empty values
182184
partition_id = 0
183-
cp = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {}
184-
if isinstance(cp, dict) and cp.get("partition_id") is not None:
185-
partition_id = _to_int(cp.get("partition_id"), 0)
186-
else:
187-
mp = (
188-
self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
189-
)
190-
if isinstance(mp, dict) and mp.get("current_partition_id") is not None:
191-
partition_id = _to_int(mp.get("current_partition_id"), 0)
192185

193186
try:
194187
out.append(
@@ -297,73 +290,62 @@ def get_process(self) -> list[Processes] | None:
297290

298291
def get_partition(self) -> Partition | None:
299292
devices = self._get_handles()
300-
current: list[PartitionCurrent] = []
301293
memparts: list[PartitionMemory] = []
302-
resources: list[dict] = []
294+
computeparts: list[PartitionCompute] = []
303295

304296
for idx, h in enumerate(devices):
305297
# compute
306-
c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {}
307-
c_dict = c if isinstance(c, dict) else {}
298+
compute_partition = (
299+
self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {}
300+
)
308301

309302
# memory
310-
m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
311-
m_dict = m if isinstance(m, dict) else {}
303+
memory_partition = (
304+
self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
305+
)
312306

313-
prof_list: list[dict] = (
314-
[]
315-
) # amdsmi_get_gpu_accelerator_partition_profile -> currently not supported
307+
# accelerator partition currently hardcoded to empty values in API
316308

317309
try:
318-
current.append(
319-
PartitionCurrent(
310+
memparts.append(
311+
PartitionMemory(
320312
gpu_id=idx,
321-
memory=c_dict.get("memory"),
322-
accelerator_type=c_dict.get("accelerator_type"),
323-
accelerator_profile_index=c_dict.get("accelerator_profile_index"),
324-
partition_id=c_dict.get("partition_id"),
313+
partition_type=memory_partition,
325314
)
326315
)
327316
except ValidationError as e:
328317
self._log_event(
329318
category=EventCategory.APPLICATION,
330-
description="Failed to build PartitionCurrent",
319+
description="Failed to build PartitionMemory",
331320
data={
332321
"exception": get_exception_traceback(e),
333322
"gpu_index": idx,
334-
"data": c_dict,
323+
"data": memory_partition,
335324
},
336325
priority=EventPriority.WARNING,
337326
)
338327

339328
try:
340-
memparts.append(
341-
PartitionMemory(
329+
computeparts.append(
330+
PartitionCompute(
342331
gpu_id=idx,
343-
memory_partition_caps=m_dict.get("memory_partition_caps"),
344-
current_partition_id=m_dict.get("current_partition_id"),
332+
partition_type=compute_partition,
345333
)
346334
)
347335
except ValidationError as e:
348336
self._log_event(
349337
category=EventCategory.APPLICATION,
350-
description="Failed to build PartitionMemory",
338+
description="Failed to build PartitionCompute",
351339
data={
352340
"exception": get_exception_traceback(e),
353341
"gpu_index": idx,
354-
"data": m_dict,
342+
"data": compute_partition,
355343
},
356344
priority=EventPriority.WARNING,
357345
)
358346

359-
resources.append({"gpu_id": idx, "profiles": []})
360-
361347
try:
362-
return Partition(
363-
current_partition=current,
364-
memory_partition=memparts,
365-
partition_resources=resources,
366-
)
348+
return Partition(memory_partition=memparts, compute_partition=computeparts)
367349
except ValidationError as e:
368350
self._log_event(
369351
category=EventCategory.APPLICATION,
@@ -382,38 +364,19 @@ def get_firmware(self) -> list[Fw] | None:
382364
if raw is None:
383365
continue
384366

385-
if isinstance(raw, list):
386-
items = raw
387-
elif isinstance(raw, dict):
388-
if isinstance(raw.get("fw_list"), list):
389-
items = raw["fw_list"]
390-
elif raw and all(not isinstance(v, (dict, list, tuple)) for v in raw.values()):
391-
items = [{"fw_id": k, "fw_version": v} for k, v in raw.items()]
392-
else:
393-
items = [raw]
394-
else:
395-
items = []
367+
items = raw["fw_list"]
396368

397369
normalized: list[FwListItem] = []
398370
for e in items:
399371
if isinstance(e, dict):
400-
fid = (
401-
e.get("fw_id")
402-
or e.get("fw_name")
403-
or e.get("name")
404-
or e.get("block")
405-
or e.get("type")
406-
or e.get("id")
407-
)
408-
ver = e.get("fw_version") or e.get("version") or e.get("fw_ver") or e.get("ver")
372+
fid = e.get("fw_name")
373+
ver = e.get("fw_version")
409374
normalized.append(
410375
FwListItem(
411-
fw_id="" if fid is None else str(fid),
376+
fw_name="" if fid is None else str(fid),
412377
fw_version="" if ver is None else str(ver),
413378
)
414379
)
415-
elif isinstance(e, (tuple, list)) and len(e) >= 2:
416-
normalized.append(FwListItem(fw_id=str(e[0]), fw_version=str(e[1])))
417380
else:
418381
self._log_event(
419382
category=EventCategory.APPLICATION,
@@ -487,11 +450,6 @@ def get_static(self) -> list[AmdSmiStatic] | None:
487450
if not devices:
488451
return []
489452

490-
def _nz(val: object, default: str = "unknown") -> str:
491-
"""Normalize possibly-empty/NA strings to a non-empty default."""
492-
s = str(val).strip() if val is not None else ""
493-
return s if s and s.upper() != "N/A" else default
494-
495453
pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None)
496454

497455
out: list[AmdSmiStatic] = []
@@ -507,36 +465,36 @@ def _nz(val: object, default: str = "unknown") -> str:
507465
if callable(pcie_fn):
508466
p = self._smi_try(pcie_fn, h, default={}) or {}
509467
if isinstance(p, dict):
510-
max_w = p.get("max_link_width")
511-
max_s = p.get("max_link_speed")
512-
pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version")
468+
max_w = p.get("max_pcie_width")
469+
max_s = p.get("max_pcie_speed")
470+
pcie_ver = p.get("pcie_interface_version")
513471
bus = StaticBus(
514472
bdf=bdf,
515473
max_pcie_width=self._vu(max_w, "x"),
516474
max_pcie_speed=self._vu(max_s, "GT/s"),
517-
pcie_interface_version=_nz(pcie_ver),
518-
slot_type=_nz(p.get("slot_type")),
475+
pcie_interface_version=self._nz(pcie_ver),
476+
slot_type=self._nz(p.get("slot_type"), slot_type=True),
519477
)
520478
else:
521479
bus = StaticBus(
522480
bdf=bdf,
523481
max_pcie_width=None,
524482
max_pcie_speed=None,
525483
pcie_interface_version="unknown",
526-
slot_type="unknown",
484+
slot_type="Unknown",
527485
)
528486
else:
529487
bus = StaticBus(
530488
bdf=bdf,
531489
max_pcie_width=None,
532490
max_pcie_speed=None,
533491
pcie_interface_version="unknown",
534-
slot_type="unknown",
492+
slot_type="Unknown",
535493
)
536494

537495
# ASIC
538496
asic_model = StaticAsic(
539-
market_name=_nz(asic.get("market_name") or asic.get("asic_name"), default=""),
497+
market_name=self._nz(asic.get("market_name") or asic.get("asic_name"), default=""),
540498
vendor_id=str(asic.get("vendor_id", "")),
541499
vendor_name=str(asic.get("vendor_name", "")),
542500
subvendor_id=str(asic.get("subvendor_id", "")),
@@ -566,8 +524,8 @@ def _nz(val: object, default: str = "unknown") -> str:
566524
if callable(drv_fn):
567525
drv = self._smi_try(drv_fn, h, default={}) or {}
568526
driver_model = StaticDriver(
569-
name=_nz(drv.get("driver_name"), default="unknown"),
570-
version=_nz(drv.get("driver_version"), default="unknown"),
527+
name=self._nz(drv.get("driver_name"), default="unknown"),
528+
version=self._nz(drv.get("driver_version"), default="unknown"),
571529
)
572530

573531
# VBIOS
@@ -915,3 +873,25 @@ def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | No
915873
except Exception:
916874
return ValueUnit(value=0, unit=unit) if required else None
917875
return ValueUnit(value=n, unit=unit)
876+
877+
def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) -> str:
878+
"""
879+
Normalize strings:
880+
- Generic: return trimmed value unless empty/'N/A', else `default`.
881+
- slot_type=True: map to one of {'OAM','PCIE','CEM','Unknown'}.
882+
"""
883+
s = str(val).strip() if val is not None else ""
884+
if not s or s.upper() == "N/A":
885+
return "Unknown" if slot_type else default
886+
887+
if slot_type:
888+
u = s.upper().replace(" ", "").replace("-", "")
889+
if u == "OAM":
890+
return "OAM"
891+
if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"):
892+
return "PCIE"
893+
if u == "CEM":
894+
return "CEM"
895+
return "Unknown"
896+
897+
return s

nodescraper/plugins/inband/amdsmi/amdsmidata.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -233,8 +233,8 @@ class Processes(BaseModel):
233233

234234
# FW
235235
class FwListItem(BaseModel):
236-
fw_id: str
237236
fw_version: str
237+
fw_name: str
238238

239239

240240
class Fw(BaseModel):
@@ -273,8 +273,8 @@ def _stringify(cls, v):
273273
return str(v)
274274

275275

276-
class PartitionCurrent(BaseModel):
277-
"""Contains the Current Partition data for the GPUs"""
276+
class PartitionAccelerator(BaseModel):
277+
"""Contains the tition data for the GPUs"""
278278

279279
gpu_id: int
280280
memory: str | None = None
@@ -287,33 +287,37 @@ class PartitionMemory(BaseModel):
287287
"""Memory Partition data"""
288288

289289
gpu_id: int
290-
memory_partition_caps: str | None = None
291-
current_partition_id: str | None = None
290+
partition_type: str | None = None
292291

293292

294-
class PartitionProfiles(AmdSmiBaseModel):
295-
"""Partition Profiles data"""
293+
class PartitionCompute(BaseModel):
294+
"""Compute Partition data"""
296295

297296
gpu_id: int
298-
profile_index: str | None = None
299-
memory_partition_caps: str | None = None
300-
accelerator_type: str | None = None
301-
partition_id: str | None = None
302-
num_partitions: str | None = None
303-
num_resources: str | None = None
304-
resource_index: str | None = None
305-
resource_type: str | None = None
306-
resource_instances: str | None = None
307-
resources_shared: str | None = None
297+
partition_type: str | None = None
298+
299+
300+
# class PartitionProfiles(AmdSmiBaseModel):
301+
# """Partition Profiles data"""
302+
#
303+
# gpu_id: int
304+
# profile_index: str | None = None
305+
# memory_partition_caps: str | None = None
306+
# accelerator_type: str | None = None
307+
# partition_id: str | None = None
308+
# num_partitions: str | None = None
309+
# num_resources: str | None = None
310+
# resource_index: str | None = None
311+
# resource_type: str | None = None
312+
# resource_instances: str | None = None
313+
# resources_shared: str | None = None
308314

309315

310316
class Partition(BaseModel):
311317
"""Contains the partition info for amd-smi"""
312318

313-
current_partition: list[PartitionCurrent] = Field(default_factory=list)
314319
memory_partition: list[PartitionMemory] = Field(default_factory=list)
315-
partition_profiles: list[dict] = Field(default_factory=list)
316-
partition_resources: list[dict] = Field(default_factory=list)
320+
compute_partition: list[PartitionCompute] = Field(default_factory=list)
317321

318322

319323
### STATIC DATA ###

0 commit comments

Comments
 (0)