Skip to content

Commit 9402695

Browse files
committed
fix for when amd-smi reports extra fields
1 parent 7e5830e commit 9402695

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,33 @@ def _run_amd_smi(self, cmd: str) -> Optional[str]:
100100
"""
101101
cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}")
102102

103-
# Check for known warnings that can be ignored
103+
# Check for known warnings and errors that can be handled
104104
is_group_warning = (
105105
"User is missing the following required groups" in cmd_ret.stderr
106106
or "User is missing the following required groups" in cmd_ret.stdout
107107
)
108108

109+
# Detect amd-smi internal errors: Python exception names (KeyError, AttributeError, IndexError) appearing in stderr
110+
is_amdsmi_internal_error = any(
111+
pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"]
112+
)
113+
109114
# Handle any stderr output or non-zero exit code; the missing-group warning is kept separate from real errors
110115
if cmd_ret.stderr != "" or cmd_ret.exit_code != 0:
111-
if not is_group_warning:
116+
if is_amdsmi_internal_error:
117+
self._log_event(
118+
category=EventCategory.SW_DRIVER,
119+
description="amd-smi internal error detected",
120+
data={
121+
"command": cmd,
122+
"exit_code": cmd_ret.exit_code,
123+
"stderr": cmd_ret.stderr,
124+
},
125+
priority=EventPriority.WARNING,
126+
console_log=True,
127+
)
128+
return None
129+
elif not is_group_warning:
112130
self._log_event(
113131
category=EventCategory.APPLICATION,
114132
description="Error running amd-smi command",
@@ -595,7 +613,23 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]:
595613
"""
596614
ret = self._run_amd_smi_dict("static -g all")
597615
if not ret:
598-
return []
616+
self.logger.info("Bulk static query failed, attempting per-GPU fallback")
617+
gpu_list = self.get_gpu_list()
618+
if gpu_list:
619+
fallback_data: list[dict] = []
620+
for gpu in gpu_list:
621+
gpu_data = self._run_amd_smi_dict(f"static -g {gpu.gpu}")
622+
if gpu_data:
623+
if isinstance(gpu_data, dict):
624+
fallback_data.append(gpu_data)
625+
elif isinstance(gpu_data, list):
626+
fallback_data.extend(gpu_data)
627+
if fallback_data:
628+
ret = fallback_data
629+
else:
630+
return []
631+
else:
632+
return []
599633

600634
if isinstance(ret, dict) and "gpu_data" in ret:
601635
ret = ret["gpu_data"]

0 commit comments

Comments
 (0)