Skip to content

Commit 688deee

Browse files
Jaspal Singh
authored and committed
addressed review comments
1 parent 995025f commit 688deee

File tree

4 files changed

+93
-160
lines changed

4 files changed

+93
-160
lines changed

nodescraper/plugins/inband/rocm/rocm_analyzer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def analyze_data(
7777
)
7878
return self.result
7979

80-
# validate rocm_latest if provided in args (only if version check passed)
80+
# validate rocm_latest if provided in args
8181
if args.exp_rocm_latest:
8282
if data.rocm_latest_versioned_path != args.exp_rocm_latest:
8383
self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}"

nodescraper/plugins/inband/rocm/rocm_collector.py

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@
3030
from nodescraper.connection.inband import TextFileArtifact
3131
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
3232
from nodescraper.models import TaskResult
33+
from nodescraper.utils import (
34+
CMD_CLINFO,
35+
CMD_ENV_VARS,
36+
CMD_KFD_PROC,
37+
CMD_LD_CONF,
38+
CMD_ROCM_DIRS,
39+
CMD_ROCM_LATEST,
40+
CMD_ROCM_LIBS,
41+
CMD_ROCMINFO,
42+
CMD_VERSION_PATHS,
43+
)
3344

3445
from .rocmdata import RocmDataModel
3546

@@ -40,18 +51,6 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]):
4051
SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}
4152

4253
DATA_MODEL = RocmDataModel
43-
CMD_VERSION_PATHS = [
44-
"/opt/rocm/.info/version-rocm",
45-
"/opt/rocm/.info/version",
46-
]
47-
CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
48-
CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
49-
CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
50-
CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
51-
CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
52-
CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
53-
CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
54-
CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"
5554

5655
@staticmethod
5756
def _strip_ansi_codes(text: str) -> str:
@@ -73,42 +72,49 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
7372
Returns:
7473
tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
7574
"""
76-
version_paths = [
77-
"/opt/rocm/.info/version-rocm",
78-
"/opt/rocm/.info/version",
79-
]
80-
8175
rocm_data = None
82-
for path in self.CMD_VERSION_PATHS:
76+
for path in CMD_VERSION_PATHS:
8377
res = self._run_sut_cmd(f"grep . {path}")
8478
if res.exit_code == 0:
85-
rocm_data = RocmDataModel(rocm_version=res.stdout)
86-
self._log_event(
87-
category="ROCM_VERSION_READ",
88-
description="ROCm version data collected",
89-
data=rocm_data.model_dump(include={"rocm_version"}),
90-
priority=EventPriority.INFO,
91-
)
92-
self.result.message = f"ROCm version: {rocm_data.rocm_version}"
93-
self.result.status = ExecutionStatus.OK
94-
break
79+
try:
80+
rocm_data = RocmDataModel(rocm_version=res.stdout)
81+
self._log_event(
82+
category="ROCM_VERSION_READ",
83+
description="ROCm version data collected",
84+
data=rocm_data.model_dump(include={"rocm_version"}),
85+
priority=EventPriority.INFO,
86+
)
87+
self.result.message = f"ROCm version: {rocm_data.rocm_version}"
88+
self.result.status = ExecutionStatus.OK
89+
break
90+
except ValueError as e:
91+
self._log_event(
92+
category=EventCategory.OS,
93+
description=f"Invalid ROCm version format: {res.stdout}",
94+
data={"version": res.stdout, "error": str(e)},
95+
priority=EventPriority.ERROR,
96+
console_log=True,
97+
)
98+
self.result.message = f"Invalid ROCm version format: {res.stdout}"
99+
self.result.status = ExecutionStatus.ERROR
100+
return self.result, None
95101
else:
96102
self._log_event(
97103
category=EventCategory.OS,
98-
description=f"Unable to read ROCm version from {version_paths}",
104+
description=f"Unable to read ROCm version from {CMD_VERSION_PATHS}",
99105
data={"raw_output": res.stdout},
100106
priority=EventPriority.ERROR,
101107
)
102108

103109
# Collect additional ROCm data if version was found
104110
if rocm_data:
105111
# Collect latest versioned ROCm path (rocm-[3-7]*)
106-
versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST)
112+
versioned_path_res = self._run_sut_cmd(CMD_ROCM_LATEST)
107113
if versioned_path_res.exit_code == 0:
108114
rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip()
109115

110116
# Collect all ROCm paths as list
111-
all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS)
117+
all_paths_res = self._run_sut_cmd(CMD_ROCM_DIRS)
112118
if all_paths_res.exit_code == 0:
113119
rocm_data.rocm_all_paths = [
114120
path.strip()
@@ -120,7 +126,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
120126
rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm"
121127

122128
# Collect rocminfo output as list of lines with ANSI codes stripped
123-
rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path)
129+
rocminfo_cmd = CMD_ROCMINFO.format(rocm_path=rocm_path)
124130
rocminfo_res = self._run_sut_cmd(rocminfo_cmd)
125131
rocminfo_artifact_content = ""
126132
if rocminfo_res.exit_code == 0:
@@ -134,14 +140,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
134140
rocminfo_artifact_content += rocminfo_res.stdout
135141

136142
# Collect ld.so.conf ROCm entries
137-
ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF)
143+
ld_conf_res = self._run_sut_cmd(CMD_LD_CONF)
138144
if ld_conf_res.exit_code == 0:
139145
rocm_data.ld_conf_rocm = [
140146
line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip()
141147
]
142148

143149
# Collect ROCm libraries from ldconfig
144-
rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS)
150+
rocm_libs_res = self._run_sut_cmd(CMD_ROCM_LIBS)
145151
if rocm_libs_res.exit_code == 0:
146152
rocm_data.rocm_libs = [
147153
line.strip()
@@ -150,14 +156,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
150156
]
151157

152158
# Collect ROCm-related environment variables
153-
env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS)
159+
env_vars_res = self._run_sut_cmd(CMD_ENV_VARS)
154160
if env_vars_res.exit_code == 0:
155161
rocm_data.env_vars = [
156162
line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip()
157163
]
158164

159165
# Collect clinfo output
160-
clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path)
166+
clinfo_cmd = CMD_CLINFO.format(rocm_path=rocm_path)
161167
clinfo_res = self._run_sut_cmd(clinfo_cmd)
162168

163169
# Always append clinfo section to artifact, even if empty or failed
@@ -188,7 +194,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
188194
)
189195

190196
# Collect KFD process list
191-
kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC)
197+
kfd_proc_res = self._run_sut_cmd(CMD_KFD_PROC)
192198
if kfd_proc_res.exit_code == 0:
193199
rocm_data.kfd_proc = [
194200
proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip()

nodescraper/utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,18 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str:
201201

202202
middle = base[:-3] if base.endswith(".gz") else base
203203
return f"{prefix}{middle}.log"
204+
205+
206+
# ROCm Plugin Command Constants
207+
CMD_VERSION_PATHS = [
208+
"/opt/rocm/.info/version-rocm",
209+
"/opt/rocm/.info/version",
210+
]
211+
CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
212+
CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
213+
CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
214+
CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
215+
CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
216+
CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
217+
CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
218+
CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"

0 commit comments

Comments
 (0)