3030from nodescraper .connection .inband import TextFileArtifact
3131from nodescraper .enums import EventCategory , EventPriority , ExecutionStatus , OSFamily
3232from nodescraper .models import TaskResult
33+ from nodescraper .utils import (
34+ CMD_CLINFO ,
35+ CMD_ENV_VARS ,
36+ CMD_KFD_PROC ,
37+ CMD_LD_CONF ,
38+ CMD_ROCM_DIRS ,
39+ CMD_ROCM_LATEST ,
40+ CMD_ROCM_LIBS ,
41+ CMD_ROCMINFO ,
42+ CMD_VERSION_PATHS ,
43+ )
3344
3445from .rocmdata import RocmDataModel
3546
@@ -40,18 +51,6 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]):
4051 SUPPORTED_OS_FAMILY : set [OSFamily ] = {OSFamily .LINUX }
4152
4253 DATA_MODEL = RocmDataModel
43- CMD_VERSION_PATHS = [
44- "/opt/rocm/.info/version-rocm" ,
45- "/opt/rocm/.info/version" ,
46- ]
47- CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
48- CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
49- CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
50- CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
51- CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
52- CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
53- CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
54- CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"
5554
5655 @staticmethod
5756 def _strip_ansi_codes (text : str ) -> str :
@@ -73,42 +72,49 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
7372 Returns:
7473 tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
7574 """
76- version_paths = [
77- "/opt/rocm/.info/version-rocm" ,
78- "/opt/rocm/.info/version" ,
79- ]
80-
8175 rocm_data = None
82- for path in self . CMD_VERSION_PATHS :
76+ for path in CMD_VERSION_PATHS :
8377 res = self ._run_sut_cmd (f"grep . { path } " )
8478 if res .exit_code == 0 :
85- rocm_data = RocmDataModel (rocm_version = res .stdout )
86- self ._log_event (
87- category = "ROCM_VERSION_READ" ,
88- description = "ROCm version data collected" ,
89- data = rocm_data .model_dump (include = {"rocm_version" }),
90- priority = EventPriority .INFO ,
91- )
92- self .result .message = f"ROCm version: { rocm_data .rocm_version } "
93- self .result .status = ExecutionStatus .OK
94- break
79+ try :
80+ rocm_data = RocmDataModel (rocm_version = res .stdout )
81+ self ._log_event (
82+ category = "ROCM_VERSION_READ" ,
83+ description = "ROCm version data collected" ,
84+ data = rocm_data .model_dump (include = {"rocm_version" }),
85+ priority = EventPriority .INFO ,
86+ )
87+ self .result .message = f"ROCm version: { rocm_data .rocm_version } "
88+ self .result .status = ExecutionStatus .OK
89+ break
90+ except ValueError as e :
91+ self ._log_event (
92+ category = EventCategory .OS ,
93+ description = f"Invalid ROCm version format: { res .stdout } " ,
94+ data = {"version" : res .stdout , "error" : str (e )},
95+ priority = EventPriority .ERROR ,
96+ console_log = True ,
97+ )
98+ self .result .message = f"Invalid ROCm version format: { res .stdout } "
99+ self .result .status = ExecutionStatus .ERROR
100+ return self .result , None
95101 else :
96102 self ._log_event (
97103 category = EventCategory .OS ,
98- description = f"Unable to read ROCm version from { version_paths } " ,
104+ description = f"Unable to read ROCm version from { CMD_VERSION_PATHS } " ,
99105 data = {"raw_output" : res .stdout },
100106 priority = EventPriority .ERROR ,
101107 )
102108
103109 # Collect additional ROCm data if version was found
104110 if rocm_data :
105111 # Collect latest versioned ROCm path (rocm-[3-7]*)
106- versioned_path_res = self ._run_sut_cmd (self . CMD_ROCM_LATEST )
112+ versioned_path_res = self ._run_sut_cmd (CMD_ROCM_LATEST )
107113 if versioned_path_res .exit_code == 0 :
108114 rocm_data .rocm_latest_versioned_path = versioned_path_res .stdout .strip ()
109115
110116 # Collect all ROCm paths as list
111- all_paths_res = self ._run_sut_cmd (self . CMD_ROCM_DIRS )
117+ all_paths_res = self ._run_sut_cmd (CMD_ROCM_DIRS )
112118 if all_paths_res .exit_code == 0 :
113119 rocm_data .rocm_all_paths = [
114120 path .strip ()
@@ -120,7 +126,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
120126 rocm_path = rocm_data .rocm_latest_versioned_path or "/opt/rocm"
121127
122128 # Collect rocminfo output as list of lines with ANSI codes stripped
123- rocminfo_cmd = self . CMD_ROCMINFO .format (rocm_path = rocm_path )
129+ rocminfo_cmd = CMD_ROCMINFO .format (rocm_path = rocm_path )
124130 rocminfo_res = self ._run_sut_cmd (rocminfo_cmd )
125131 rocminfo_artifact_content = ""
126132 if rocminfo_res .exit_code == 0 :
@@ -134,14 +140,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
134140 rocminfo_artifact_content += rocminfo_res .stdout
135141
136142 # Collect ld.so.conf ROCm entries
137- ld_conf_res = self ._run_sut_cmd (self . CMD_LD_CONF )
143+ ld_conf_res = self ._run_sut_cmd (CMD_LD_CONF )
138144 if ld_conf_res .exit_code == 0 :
139145 rocm_data .ld_conf_rocm = [
140146 line .strip () for line in ld_conf_res .stdout .strip ().split ("\n " ) if line .strip ()
141147 ]
142148
143149 # Collect ROCm libraries from ldconfig
144- rocm_libs_res = self ._run_sut_cmd (self . CMD_ROCM_LIBS )
150+ rocm_libs_res = self ._run_sut_cmd (CMD_ROCM_LIBS )
145151 if rocm_libs_res .exit_code == 0 :
146152 rocm_data .rocm_libs = [
147153 line .strip ()
@@ -150,14 +156,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
150156 ]
151157
152158 # Collect ROCm-related environment variables
153- env_vars_res = self ._run_sut_cmd (self . CMD_ENV_VARS )
159+ env_vars_res = self ._run_sut_cmd (CMD_ENV_VARS )
154160 if env_vars_res .exit_code == 0 :
155161 rocm_data .env_vars = [
156162 line .strip () for line in env_vars_res .stdout .strip ().split ("\n " ) if line .strip ()
157163 ]
158164
159165 # Collect clinfo output
160- clinfo_cmd = self . CMD_CLINFO .format (rocm_path = rocm_path )
166+ clinfo_cmd = CMD_CLINFO .format (rocm_path = rocm_path )
161167 clinfo_res = self ._run_sut_cmd (clinfo_cmd )
162168
163169 # Always append clinfo section to artifact, even if empty or failed
@@ -188,7 +194,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
188194 )
189195
190196 # Collect KFD process list
191- kfd_proc_res = self ._run_sut_cmd (self . CMD_KFD_PROC )
197+ kfd_proc_res = self ._run_sut_cmd (CMD_KFD_PROC )
192198 if kfd_proc_res .exit_code == 0 :
193199 rocm_data .kfd_proc = [
194200 proc .strip () for proc in kfd_proc_res .stdout .strip ().split ("\n " ) if proc .strip ()
0 commit comments