Skip to content

Commit 876c91c

Browse files
Merge branch 'development' into alex_devenum
2 parents fa929a1 + cb523bc commit 876c91c

File tree

16 files changed

+865
-72
lines changed

16 files changed

+865
-72
lines changed

.github/workflows/code_quality_checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ on:
1111
jobs:
1212
pre-commit:
1313
runs-on: [ self-hosted ]
14-
container: python:3.10
14+
container: python:3.9
1515

1616
steps:
1717
- uses: actions/checkout@v3
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
name: Python Functional Tests
2+
3+
on:
4+
workflow_dispatch:
5+
pull_request:
6+
push:
7+
branches: [ "main" ]
8+
9+
permissions:
10+
contents: read
11+
12+
jobs:
13+
run_tests:
14+
runs-on: [ self-hosted ]
15+
container: python:3.9
16+
17+
steps:
18+
- uses: actions/checkout@v3
19+
20+
- name: Install xmllint
21+
run: |
22+
apt-get update
23+
apt-get install -y libxml2-utils bc
24+
25+
- name: Install package and run functional tests
26+
id: run_functional_tests
27+
shell: bash
28+
run: |
29+
source ./dev-setup.sh
30+
pytest test/functional -s --disable-warnings -v

.github/workflows/unit-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ permissions:
1212
jobs:
1313
run_tests:
1414
runs-on: [ self-hosted ]
15-
container: python:3.10
15+
container: python:3.9
1616

1717
steps:
1818
- uses: actions/checkout@v3

nodescraper/plugins/inband/rocm/analyzer_args.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
class RocmAnalyzerArgs(AnalyzerArgs):
3535
exp_rocm: Union[str, list] = Field(default_factory=list)
36+
exp_rocm_latest: str = Field(default="")
3637

3738
@field_validator("exp_rocm", mode="before")
3839
@classmethod

nodescraper/plugins/inband/rocm/rocm_analyzer.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,40 @@ def analyze_data(
6161
if data.rocm_version == rocm_version:
6262
self.result.message = "ROCm version matches expected"
6363
self.result.status = ExecutionStatus.OK
64+
break
65+
else:
66+
# No matching version found
67+
self.result.message = (
68+
f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}"
69+
)
70+
self.result.status = ExecutionStatus.ERROR
71+
self._log_event(
72+
category=EventCategory.SW_DRIVER,
73+
description=f"{self.result.message}",
74+
data={"expected": args.exp_rocm, "actual": data.rocm_version},
75+
priority=EventPriority.CRITICAL,
76+
console_log=True,
77+
)
78+
return self.result
79+
80+
# validate rocm_latest if provided in args
81+
if args.exp_rocm_latest:
82+
if data.rocm_latest_versioned_path != args.exp_rocm_latest:
83+
self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}"
84+
self.result.status = ExecutionStatus.ERROR
85+
self._log_event(
86+
category=EventCategory.SW_DRIVER,
87+
description=f"{self.result.message}",
88+
data={
89+
"expected": args.exp_rocm_latest,
90+
"actual": data.rocm_latest_versioned_path,
91+
},
92+
priority=EventPriority.CRITICAL,
93+
console_log=True,
94+
)
6495
return self.result
96+
else:
97+
# Update message to include rocm_latest validation result
98+
self.result.message = f"ROCm version matches expected. ROCm latest path validated: {data.rocm_latest_versioned_path}"
6599

66-
self.result.message = (
67-
f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}"
68-
)
69-
self.result.status = ExecutionStatus.ERROR
70-
self._log_event(
71-
category=EventCategory.SW_DRIVER,
72-
description=f"{self.result.message}",
73-
data={"expected": args.exp_rocm, "actual": data.rocm_version},
74-
priority=EventPriority.CRITICAL,
75-
console_log=True,
76-
)
77100
return self.result

nodescraper/plugins/inband/rocm/rocm_collector.py

Lines changed: 127 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@
2626
from typing import Optional
2727

2828
from nodescraper.base import InBandDataCollector
29+
from nodescraper.connection.inband import TextFileArtifact
2930
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
3031
from nodescraper.models import TaskResult
32+
from nodescraper.utils import strip_ansi_codes
3133

3234
from .rocmdata import RocmDataModel
3335

@@ -42,40 +44,149 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]):
4244
"/opt/rocm/.info/version-rocm",
4345
"/opt/rocm/.info/version",
4446
]
47+
CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
48+
CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
49+
CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
50+
CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
51+
CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
52+
CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
53+
CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
54+
CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"
4555

4656
def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
4757
"""Collect ROCm version data from the system.
4858
4959
Returns:
5060
tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
5161
"""
52-
version_paths = [
53-
"/opt/rocm/.info/version-rocm",
54-
"/opt/rocm/.info/version",
55-
]
56-
5762
rocm_data = None
5863
for path in self.CMD_VERSION_PATHS:
5964
res = self._run_sut_cmd(f"grep . {path}")
6065
if res.exit_code == 0:
61-
rocm_data = RocmDataModel(rocm_version=res.stdout)
62-
self._log_event(
63-
category="ROCM_VERSION_READ",
64-
description="ROCm version data collected",
65-
data=rocm_data.model_dump(),
66-
priority=EventPriority.INFO,
67-
)
68-
self.result.message = f"ROCm: {rocm_data.model_dump()}"
69-
self.result.status = ExecutionStatus.OK
70-
break
66+
try:
67+
rocm_data = RocmDataModel(rocm_version=res.stdout)
68+
self._log_event(
69+
category="ROCM_VERSION_READ",
70+
description="ROCm version data collected",
71+
data=rocm_data.model_dump(include={"rocm_version"}),
72+
priority=EventPriority.INFO,
73+
)
74+
self.result.message = f"ROCm version: {rocm_data.rocm_version}"
75+
self.result.status = ExecutionStatus.OK
76+
break
77+
except ValueError as e:
78+
self._log_event(
79+
category=EventCategory.OS,
80+
description=f"Invalid ROCm version format: {res.stdout}",
81+
data={"version": res.stdout, "error": str(e)},
82+
priority=EventPriority.ERROR,
83+
console_log=True,
84+
)
85+
self.result.message = f"Invalid ROCm version format: {res.stdout}"
86+
self.result.status = ExecutionStatus.ERROR
87+
return self.result, None
7188
else:
7289
self._log_event(
7390
category=EventCategory.OS,
74-
description=f"Unable to read ROCm version from {version_paths}",
91+
description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}",
7592
data={"raw_output": res.stdout},
7693
priority=EventPriority.ERROR,
7794
)
7895

96+
# Collect additional ROCm data if version was found
97+
if rocm_data:
98+
# Collect latest versioned ROCm path (rocm-[3-7]*)
99+
versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST)
100+
if versioned_path_res.exit_code == 0:
101+
rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip()
102+
103+
# Collect all ROCm paths as list
104+
all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS)
105+
if all_paths_res.exit_code == 0:
106+
rocm_data.rocm_all_paths = [
107+
path.strip()
108+
for path in all_paths_res.stdout.strip().split("\n")
109+
if path.strip()
110+
]
111+
112+
# Determine ROCm path for commands that need it
113+
rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm"
114+
115+
# Collect rocminfo output as list of lines with ANSI codes stripped
116+
rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path)
117+
rocminfo_res = self._run_sut_cmd(rocminfo_cmd)
118+
rocminfo_artifact_content = ""
119+
if rocminfo_res.exit_code == 0:
120+
# Split into lines and strip ANSI codes from each line
121+
rocm_data.rocminfo = [
122+
strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n")
123+
]
124+
rocminfo_artifact_content += "=" * 80 + "\n"
125+
rocminfo_artifact_content += "ROCMNFO OUTPUT\n"
126+
rocminfo_artifact_content += "=" * 80 + "\n\n"
127+
rocminfo_artifact_content += rocminfo_res.stdout
128+
129+
# Collect ld.so.conf ROCm entries
130+
ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF)
131+
if ld_conf_res.exit_code == 0:
132+
rocm_data.ld_conf_rocm = [
133+
line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip()
134+
]
135+
136+
# Collect ROCm libraries from ldconfig
137+
rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS)
138+
if rocm_libs_res.exit_code == 0:
139+
rocm_data.rocm_libs = [
140+
line.strip()
141+
for line in rocm_libs_res.stdout.strip().split("\n")
142+
if line.strip()
143+
]
144+
145+
# Collect ROCm-related environment variables
146+
env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS)
147+
if env_vars_res.exit_code == 0:
148+
rocm_data.env_vars = [
149+
line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip()
150+
]
151+
152+
# Collect clinfo output
153+
clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path)
154+
clinfo_res = self._run_sut_cmd(clinfo_cmd)
155+
156+
# Always append clinfo section to artifact, even if empty or failed
157+
if rocminfo_artifact_content:
158+
rocminfo_artifact_content += "\n\n"
159+
rocminfo_artifact_content += "=" * 80 + "\n"
160+
rocminfo_artifact_content += "CLINFO OUTPUT\n"
161+
rocminfo_artifact_content += "=" * 80 + "\n\n"
162+
163+
if clinfo_res.exit_code == 0:
164+
rocm_data.clinfo = [
165+
strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n")
166+
]
167+
rocminfo_artifact_content += clinfo_res.stdout
168+
else:
169+
# Add error information if clinfo failed
170+
rocminfo_artifact_content += f"Command: {clinfo_res.command}\n"
171+
rocminfo_artifact_content += f"Exit Code: {clinfo_res.exit_code}\n"
172+
if clinfo_res.stderr:
173+
rocminfo_artifact_content += f"Error: {clinfo_res.stderr}\n"
174+
if clinfo_res.stdout:
175+
rocminfo_artifact_content += f"Output: {clinfo_res.stdout}\n"
176+
177+
# Add combined rocminfo and clinfo output as a text file artifact
178+
if rocminfo_artifact_content:
179+
self.result.artifacts.append(
180+
TextFileArtifact(filename="rocminfo.log", contents=rocminfo_artifact_content)
181+
)
182+
183+
# Collect KFD process list
184+
kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC)
185+
if kfd_proc_res.exit_code == 0:
186+
rocm_data.kfd_proc = [
187+
proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip()
188+
]
189+
79190
if not rocm_data:
80191
self._log_event(
81192
category=EventCategory.OS,

nodescraper/plugins/inband/rocm/rocmdata.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#
2525
###############################################################################
2626
import re
27+
from typing import List
2728

2829
from pydantic import field_validator
2930

@@ -32,6 +33,14 @@
3233

3334
class RocmDataModel(DataModel):
3435
rocm_version: str
36+
rocminfo: List[str] = []
37+
rocm_latest_versioned_path: str = ""
38+
rocm_all_paths: List[str] = []
39+
ld_conf_rocm: List[str] = []
40+
rocm_libs: List[str] = []
41+
env_vars: List[str] = []
42+
clinfo: List[str] = []
43+
kfd_proc: List[str] = []
3544

3645
@field_validator("rocm_version")
3746
@classmethod

nodescraper/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,17 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str:
245245

246246
middle = base[:-3] if base.endswith(".gz") else base
247247
return f"{prefix}{middle}.log"
248+
249+
250+
# Precompiled once at import time: strip_ansi_codes is called per output line.
# Matches an ANSI CSI sequence: ESC '[', then parameter bytes (0x30-0x3F),
# then intermediate bytes (0x20-0x2F), then a single final byte (0x40-0x7E).
# This covers SGR colour codes ("\x1b[31m") as well as cursor/erase
# sequences ("\x1b[2K", "\x1b[?25l").
_ANSI_CSI_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")


def strip_ansi_codes(text: str) -> str:
    """
    Remove ANSI escape codes from text.

    Every CSI escape sequence (colour/style codes and cursor or erase
    controls) is removed; all other characters are returned unchanged.

    Args:
        text (str): The text string containing ANSI escape codes.

    Returns:
        str: The text with ANSI escape codes removed.
    """
    return _ANSI_CSI_RE.sub("", text)

test/functional/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
###############################################################################
2+
#
3+
# MIT License
4+
#
5+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
6+
#
7+
# Permission is hereby granted, free of charge, to any person obtaining a copy
8+
# of this software and associated documentation files (the "Software"), to deal
9+
# in the Software without restriction, including without limitation the rights
10+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
# copies of the Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice shall be included in all
15+
# copies or substantial portions of the Software.
16+
#
17+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
# SOFTWARE.
24+
#
25+
###############################################################################
26+
"""Functional tests for node-scraper."""

0 commit comments

Comments
 (0)