Skip to content

Commit 1e456c3

Browse files
committed
updates
1 parent b217c24 commit 1e456c3

File tree

1 file changed

+14
-25
lines changed

1 file changed

+14
-25
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 14 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,13 @@ def _to_int(x, default=0):
235235
uuid = self._smi_try(amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or ""
236236
kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {}
237237

238-
partition_id = 0 # no profile id available yet
238+
kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {}
239+
partition_id = 0
240+
if isinstance(kfd, dict):
241+
try:
242+
partition_id = int(kfd.get("current_partition_id", 0) or 0)
243+
except Exception:
244+
partition_id = 0
239245

240246
try:
241247
out.append(
@@ -349,26 +355,12 @@ def get_partition(self) -> Partition | None:
349355
computeparts: list[PartitionCompute] = []
350356

351357
for idx, h in enumerate(devices):
352-
compute_partition = (
353-
self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {}
354-
)
355-
memory_partition = (
356-
self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {}
357-
)
358-
359-
mem_pt: Optional[str] = None
360-
if isinstance(memory_partition, dict):
361-
mem_pt = cast(Optional[str], memory_partition.get("partition_type"))
362-
comp_pt: Optional[str] = None
363-
if isinstance(compute_partition, dict):
364-
comp_pt = cast(Optional[str], compute_partition.get("partition_type"))
358+
mem_pt = self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default=None)
359+
comp_pt = self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default=None)
365360

366361
try:
367362
memparts.append(
368-
PartitionMemory(
369-
gpu_id=idx,
370-
partition_type=mem_pt,
371-
)
363+
PartitionMemory(gpu_id=idx, partition_type=cast(Optional[str], mem_pt))
372364
)
373365
except ValidationError as e:
374366
self._log_event(
@@ -377,17 +369,14 @@ def get_partition(self) -> Partition | None:
377369
data={
378370
"exception": get_exception_traceback(e),
379371
"gpu_index": idx,
380-
"data": memory_partition,
372+
"data": mem_pt,
381373
},
382374
priority=EventPriority.WARNING,
383375
)
384376

385377
try:
386378
computeparts.append(
387-
PartitionCompute(
388-
gpu_id=idx,
389-
partition_type=comp_pt,
390-
)
379+
PartitionCompute(gpu_id=idx, partition_type=cast(Optional[str], comp_pt))
391380
)
392381
except ValidationError as e:
393382
self._log_event(
@@ -396,7 +385,7 @@ def get_partition(self) -> Partition | None:
396385
data={
397386
"exception": get_exception_traceback(e),
398387
"gpu_index": idx,
399-
"data": compute_partition,
388+
"data": comp_pt,
400389
},
401390
priority=EventPriority.WARNING,
402391
)
@@ -817,7 +806,7 @@ def _as_list_str(v) -> list[str]:
817806

818807
lvl_val = cache_level.value
819808
cache_label_val = (
820-
f"Lable_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}"
809+
f"Label_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}"
821810
)
822811
cache_label = ValueUnit(value=cache_label_val, unit="")
823812

0 commit comments

Comments
 (0)