2626from typing import Optional
2727
2828from nodescraper .base import InBandDataCollector
29+ from nodescraper .connection .inband import TextFileArtifact
2930from nodescraper .enums import EventCategory , EventPriority , ExecutionStatus , OSFamily
3031from nodescraper .models import TaskResult
32+ from nodescraper .utils import strip_ansi_codes
3133
3234from .rocmdata import RocmDataModel
3335
@@ -42,40 +44,149 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]):
4244 "/opt/rocm/.info/version-rocm" ,
4345 "/opt/rocm/.info/version" ,
4446 ]
# Shell command strings that collect_data passes to self._run_sut_cmd().
# Entries containing "{rocm_path}" are templates: collect_data fills them in
# with str.format(rocm_path=...), using the latest versioned ROCm path when
# one was found and "/opt/rocm" otherwise.
#
# rocminfo binary under the selected ROCm install.
47+ CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
# Version-sorted (ls -v) listing of /opt/rocm-<N>* directories; tail -1 keeps
# only the last (highest-sorted) entry.
# NOTE(review): the [3-7] character class only matches names whose first
# character after "rocm-" is 3..7, so e.g. rocm-8.x or rocm-10.x directories
# would not be picked up -- confirm this range is intentional.
48+ CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
# Every /opt/rocm* entry (directories themselves, not their contents).
49+ CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
# Case-insensitive search for "rocm" in the files under /etc/ld.so.conf.d/.
50+ CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
# Entries printed by "ldconfig -p" that mention "rocm" (case-insensitive).
51+ CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
# Lines of "env" output matching any of the listed keywords
# (case-insensitive).
52+ CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
# clinfo under the selected ROCm install; the "*" path component is left
# unexpanded here -- presumably resolved by the shell on the target, verify
# against _run_sut_cmd.
53+ CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
# Listing of /sys/class/kfd/kfd/proc/; collect_data stores the non-empty
# lines in the data model's kfd_proc field.
54+ CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"
4555
4656 def collect_data (self , args = None ) -> tuple [TaskResult , Optional [RocmDataModel ]]:
4757 """Collect ROCm version data from the system.
4858
4959 Returns:
5060 tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
5161 """
52- version_paths = [
53- "/opt/rocm/.info/version-rocm" ,
54- "/opt/rocm/.info/version" ,
55- ]
56-
5762 rocm_data = None
5863 for path in self .CMD_VERSION_PATHS :
5964 res = self ._run_sut_cmd (f"grep . { path } " )
6065 if res .exit_code == 0 :
61- rocm_data = RocmDataModel (rocm_version = res .stdout )
62- self ._log_event (
63- category = "ROCM_VERSION_READ" ,
64- description = "ROCm version data collected" ,
65- data = rocm_data .model_dump (),
66- priority = EventPriority .INFO ,
67- )
68- self .result .message = f"ROCm: { rocm_data .model_dump ()} "
69- self .result .status = ExecutionStatus .OK
70- break
66+ try :
67+ rocm_data = RocmDataModel (rocm_version = res .stdout )
68+ self ._log_event (
69+ category = "ROCM_VERSION_READ" ,
70+ description = "ROCm version data collected" ,
71+ data = rocm_data .model_dump (include = {"rocm_version" }),
72+ priority = EventPriority .INFO ,
73+ )
74+ self .result .message = f"ROCm version: { rocm_data .rocm_version } "
75+ self .result .status = ExecutionStatus .OK
76+ break
77+ except ValueError as e :
78+ self ._log_event (
79+ category = EventCategory .OS ,
80+ description = f"Invalid ROCm version format: { res .stdout } " ,
81+ data = {"version" : res .stdout , "error" : str (e )},
82+ priority = EventPriority .ERROR ,
83+ console_log = True ,
84+ )
85+ self .result .message = f"Invalid ROCm version format: { res .stdout } "
86+ self .result .status = ExecutionStatus .ERROR
87+ return self .result , None
7188 else :
7289 self ._log_event (
7390 category = EventCategory .OS ,
74- description = f"Unable to read ROCm version from { version_paths } " ,
91+ description = f"Unable to read ROCm version from { self . CMD_VERSION_PATHS } " ,
7592 data = {"raw_output" : res .stdout },
7693 priority = EventPriority .ERROR ,
7794 )
7895
96+ # Collect additional ROCm data if version was found
97+ if rocm_data :
98+ # Collect latest versioned ROCm path (rocm-[3-7]*)
99+ versioned_path_res = self ._run_sut_cmd (self .CMD_ROCM_LATEST )
100+ if versioned_path_res .exit_code == 0 :
101+ rocm_data .rocm_latest_versioned_path = versioned_path_res .stdout .strip ()
102+
103+ # Collect all ROCm paths as list
104+ all_paths_res = self ._run_sut_cmd (self .CMD_ROCM_DIRS )
105+ if all_paths_res .exit_code == 0 :
106+ rocm_data .rocm_all_paths = [
107+ path .strip ()
108+ for path in all_paths_res .stdout .strip ().split ("\n " )
109+ if path .strip ()
110+ ]
111+
112+ # Determine ROCm path for commands that need it
113+ rocm_path = rocm_data .rocm_latest_versioned_path or "/opt/rocm"
114+
115+ # Collect rocminfo output as list of lines with ANSI codes stripped
116+ rocminfo_cmd = self .CMD_ROCMINFO .format (rocm_path = rocm_path )
117+ rocminfo_res = self ._run_sut_cmd (rocminfo_cmd )
118+ rocminfo_artifact_content = ""
119+ if rocminfo_res .exit_code == 0 :
120+ # Split into lines and strip ANSI codes from each line
121+ rocm_data .rocminfo = [
122+ strip_ansi_codes (line ) for line in rocminfo_res .stdout .strip ().split ("\n " )
123+ ]
124+ rocminfo_artifact_content += "=" * 80 + "\n "
125+ rocminfo_artifact_content += "ROCMNFO OUTPUT\n "
126+ rocminfo_artifact_content += "=" * 80 + "\n \n "
127+ rocminfo_artifact_content += rocminfo_res .stdout
128+
129+ # Collect ld.so.conf ROCm entries
130+ ld_conf_res = self ._run_sut_cmd (self .CMD_LD_CONF )
131+ if ld_conf_res .exit_code == 0 :
132+ rocm_data .ld_conf_rocm = [
133+ line .strip () for line in ld_conf_res .stdout .strip ().split ("\n " ) if line .strip ()
134+ ]
135+
136+ # Collect ROCm libraries from ldconfig
137+ rocm_libs_res = self ._run_sut_cmd (self .CMD_ROCM_LIBS )
138+ if rocm_libs_res .exit_code == 0 :
139+ rocm_data .rocm_libs = [
140+ line .strip ()
141+ for line in rocm_libs_res .stdout .strip ().split ("\n " )
142+ if line .strip ()
143+ ]
144+
145+ # Collect ROCm-related environment variables
146+ env_vars_res = self ._run_sut_cmd (self .CMD_ENV_VARS )
147+ if env_vars_res .exit_code == 0 :
148+ rocm_data .env_vars = [
149+ line .strip () for line in env_vars_res .stdout .strip ().split ("\n " ) if line .strip ()
150+ ]
151+
152+ # Collect clinfo output
153+ clinfo_cmd = self .CMD_CLINFO .format (rocm_path = rocm_path )
154+ clinfo_res = self ._run_sut_cmd (clinfo_cmd )
155+
156+ # Always append clinfo section to artifact, even if empty or failed
157+ if rocminfo_artifact_content :
158+ rocminfo_artifact_content += "\n \n "
159+ rocminfo_artifact_content += "=" * 80 + "\n "
160+ rocminfo_artifact_content += "CLINFO OUTPUT\n "
161+ rocminfo_artifact_content += "=" * 80 + "\n \n "
162+
163+ if clinfo_res .exit_code == 0 :
164+ rocm_data .clinfo = [
165+ strip_ansi_codes (line ) for line in clinfo_res .stdout .strip ().split ("\n " )
166+ ]
167+ rocminfo_artifact_content += clinfo_res .stdout
168+ else :
169+ # Add error information if clinfo failed
170+ rocminfo_artifact_content += f"Command: { clinfo_res .command } \n "
171+ rocminfo_artifact_content += f"Exit Code: { clinfo_res .exit_code } \n "
172+ if clinfo_res .stderr :
173+ rocminfo_artifact_content += f"Error: { clinfo_res .stderr } \n "
174+ if clinfo_res .stdout :
175+ rocminfo_artifact_content += f"Output: { clinfo_res .stdout } \n "
176+
177+ # Add combined rocminfo and clinfo output as a text file artifact
178+ if rocminfo_artifact_content :
179+ self .result .artifacts .append (
180+ TextFileArtifact (filename = "rocminfo.log" , contents = rocminfo_artifact_content )
181+ )
182+
183+ # Collect KFD process list
184+ kfd_proc_res = self ._run_sut_cmd (self .CMD_KFD_PROC )
185+ if kfd_proc_res .exit_code == 0 :
186+ rocm_data .kfd_proc = [
187+ proc .strip () for proc in kfd_proc_res .stdout .strip ().split ("\n " ) if proc .strip ()
188+ ]
189+
79190 if not rocm_data :
80191 self ._log_event (
81192 category = EventCategory .OS ,
0 commit comments