Skip to content

Commit 51799a8

Browse files
committed
fix: failed on hygon detection
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent c5c3d1d commit 51799a8

File tree

4 files changed

+66
-33
lines changed

4 files changed

+66
-33
lines changed

gpustack_runtime/detector/hygon.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import logging
44
from functools import lru_cache
5+
from pathlib import Path
56

67
from .. import envs
78
from . import pyrocmsmi
@@ -77,7 +78,11 @@ def detect(self) -> Devices | None:
7778
try:
7879
pyrocmsmi.rsmi_init()
7980

80-
sys_driver_ver = pyrocmsmi.rsmi_driver_version_get()
81+
sys_driver_ver = None
82+
sys_driver_ver_path = Path("/sys/module/hydcu/version")
83+
if sys_driver_ver_path.exists():
84+
with sys_driver_ver_path.open(encoding="utf-8") as f:
85+
sys_driver_ver = f.read().strip()
8186

8287
devs_count = pyrocmsmi.rsmi_num_monitor_devices()
8388
for dev_idx in range(devs_count):

gpustack_runtime/detector/pyamdsmi/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ def amdsmi_get_rocm_original_version() -> str | None:
3333
locs = [
3434
"librocm-core.so",
3535
]
36-
rocm_path = Path(os.getenv("ROCM_PATH", os.getenv("ROCM_HOME") or "/opt/rocm"))
36+
rocm_path = Path(os.getenv("ROCM_HOME", os.getenv("ROCM_PATH") or "/opt/rocm"))
3737
if rocm_path.exists():
38-
locs.append(str(rocm_path / "lib/librocm-core.so"))
38+
locs.append(str(rocm_path / "lib" / "librocm-core.so"))
3939
for loc in locs:
4040
try:
4141
clib = CDLL(loc)

gpustack_runtime/detector/pyrocmsmi/__init__.py

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,31 @@
2020
libLoadLock = threading.Lock()
2121

2222
if rocmsmiLib is None:
23-
rocm_path = Path(os.getenv("ROCM_PATH", os.getenv("ROCM_HOME") or "/opt/rocm"))
24-
23+
# Example ROCM_SMI_LIB_PATH
24+
# - /opt/dtk-24.04.3/rocm_smi/lib
25+
# - /opt/rocm/rocm_smi/lib
2526
rocmsmi_lib_path = os.getenv("ROCM_SMI_LIB_PATH")
2627
if not rocmsmi_lib_path:
27-
rocmsmi_lib_path = str(rocm_path / "lib")
28+
# Example ROCM_PATH/ROCM_HOME
29+
# - /opt/dtk-24.04.3
30+
# - /opt/rocm
31+
rocm_path = Path(os.getenv("ROCM_HOME", os.getenv("ROCM_PATH") or "/opt/rocm"))
32+
rocmsmi_lib_path = str(rocm_path / "rocm_smi" / "lib")
33+
else:
34+
rocm_path = Path(rocmsmi_lib_path).parent.parent
2835

2936
rocmsmi_lib_loc = Path(rocmsmi_lib_path) / "librocm_smi64.so"
3037
if rocmsmi_lib_loc.exists():
31-
rocmsmi_bindings_path = rocm_path / "libexec" / "rocm_smi"
38+
rocmsmi_bindings_paths = [
39+
(rocm_path / "rocm_smi" / "bindings"),
40+
(rocm_path / "libexec" / "rocm_smi"),
41+
]
42+
rocmsmi_bindings_path = None
43+
for p in rocmsmi_bindings_paths:
44+
if p.exists():
45+
rocmsmi_bindings_path = p
46+
break
47+
3248
if rocmsmi_bindings_path.exists():
3349
if str(rocmsmi_bindings_path) not in sys.path:
3450
sys.path.append(str(rocmsmi_bindings_path))
@@ -197,18 +213,21 @@ def rsmi_dev_target_graphics_version_get(device=0):
197213
if not rocmsmiLib:
198214
raise ROCMSMIError(ROCMSMI_ERROR_UNINITIALIZED)
199215

200-
c_version = c_uint64()
201-
ret = rocmsmiLib.rsmi_dev_target_graphics_version_get(device, byref(c_version))
202-
_rocmsmiCheckReturn(ret)
203-
version = str(c_version.value)
204-
if len(version) == 4:
205-
dev_name = rsmi_dev_name_get(device)
206-
if "Instinct MI2" in dev_name:
207-
hex_part = str(hex(int(version[2:]))).replace("0x", "")
208-
version = version[:2] + hex_part
209-
else:
210-
version = str(c_version.value // 10 + c_version.value % 10)
211-
return "gfx" + version
216+
try:
217+
c_version = c_uint64()
218+
ret = rocmsmiLib.rsmi_dev_target_graphics_version_get(device, byref(c_version))
219+
_rocmsmiCheckReturn(ret)
220+
version = str(c_version.value)
221+
if len(version) == 4:
222+
dev_name = rsmi_dev_name_get(device)
223+
if "Instinct MI2" in dev_name:
224+
hex_part = str(hex(int(version[2:]))).replace("0x", "")
225+
version = version[:2] + hex_part
226+
else:
227+
version = str(c_version.value // 10 + c_version.value % 10)
228+
return "gfx" + version
229+
except AttributeError:
230+
return None
212231

213232

214233
def rsmi_dev_temp_metric_get(device=0, sensor=None, metric=None):
@@ -240,17 +259,32 @@ def rsmi_dev_power_cap_get(device=0):
240259
return c_power_cap.value // 1000000
241260

242261

243-
def rsmi_dev_power_get(device=0):
262+
def rsmi_dev_power_ave_get(device=0):
244263
if not rocmsmiLib:
245264
raise ROCMSMIError(ROCMSMI_ERROR_UNINITIALIZED)
246265

266+
c_device_chip = c_uint32(0)
247267
c_power = c_uint64()
248-
c_power_type = rsmi_power_type_t()
249-
ret = rocmsmiLib.rsmi_dev_power_get(device, byref(c_power), byref(c_power_type))
268+
ret = rocmsmiLib.rsmi_dev_power_ave_get(device, c_device_chip, byref(c_power))
250269
_rocmsmiCheckReturn(ret)
251270
return c_power.value // 1000000
252271

253272

273+
def rsmi_dev_power_get(device=0):
    """
    Return the current power reading of a device.

    Prefers the library's ``rsmi_dev_power_get`` entry point and falls back
    to ``rsmi_dev_power_ave_get`` when the newer API is unavailable.

    Args:
        device: Index of the device to query.

    Returns:
        The raw reading divided by 1,000,000 with integer division
        (presumably microwatts converted to watts; confirm against the
        ROCm SMI headers).

    Raises:
        ROCMSMIError: If the library has not been loaded, or if the
            underlying call reports a failure via ``_rocmsmiCheckReturn``.

    """
    if not rocmsmiLib:
        raise ROCMSMIError(ROCMSMI_ERROR_UNINITIALIZED)

    try:
        # Resolve everything that may be absent on older releases up front:
        # - a missing library symbol raises AttributeError on lookup
        #   (assuming rocmsmiLib is a ctypes handle; the sibling
        #   rsmi_dev_target_graphics_version_get guards the same way),
        # - a missing rsmi_power_type_t binding raises NameError.
        power_get = rocmsmiLib.rsmi_dev_power_get
        c_power_type = rsmi_power_type_t()
    except (NameError, AttributeError):
        # Fallback for older versions without rsmi_dev_power_get
        return rsmi_dev_power_ave_get(device)

    c_power = c_uint64()
    ret = power_get(device, byref(c_power), byref(c_power_type))
    _rocmsmiCheckReturn(ret)
    return c_power.value // 1000000
286+
287+
254288
def rsmi_dev_node_id_get(device=0):
255289
if not rocmsmiLib:
256290
raise ROCMSMIError(ROCMSMI_ERROR_UNINITIALIZED)

pack/Dockerfile

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -244,14 +244,6 @@ ARG TARGETARCH
244244
VOLUME /var/lib/gpustack
245245
ENV GPUSTACK_RUNTIME_DOCKER_EPHEMERAL_FILES_DIR="/var/lib/gpustack/cache/gpustack-runtime"
246246

247-
## Update
248-
RUN <<EOF
249-
# Update
250-
251-
# Update PCI IDs database for better device name detection.
252-
curl -o /usr/share/misc/pci.ids https://pci-ids.ucw.cz/v2.2/pci.ids
253-
EOF
254-
255247
## Install
256248

257249
RUN --mount=type=cache,target=/root/.cache \
@@ -269,6 +261,9 @@ RUN --mount=type=cache,target=/root/.cache \
269261
uv pip install \
270262
/workspace/runtime
271263
uv pip tree
264+
265+
# Update PCI IDs
266+
update-pciids || true
272267
EOF
273268

274269
## Entrypoint
@@ -279,7 +274,6 @@ EOF
279274
## Options:
280275
## - Mount /sys from the host to detect the correct devices' PCI info.
281276
## - Mount /opt/rocm from the host to detect the correct ROCm version.
282-
## - Mount /usr/share/misc/pci.ids from the host to detect the correct device name.
283277
## E.g. docker run --rm -it --privileged -v /var/run/docker.sock:/var/run/docker.sock gpustack/runtime:main gpustack-runtime detect --format json
284278
RUN --mount=type=bind,from=rocm-base,source=/opt/rocm/share,target=/opt/rocm/share,rw <<EOF
285279
# Reinstall amd-smi
@@ -291,7 +285,6 @@ RUN --mount=type=bind,from=rocm-base,source=/opt/rocm/share,target=/opt/rocm/sha
291285
uv pip tree
292286
EOF
293287
ENV AMD_VISIBLE_DEVICES="0" \
294-
ROCM_HOME="/opt/rocm" \
295288
GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/rocm"
296289

297290
## Activate all Ascend devices detection,
@@ -321,7 +314,8 @@ ENV CAMBRICON_VISIBLE_DEVICES="0"
321314
## See https://github.com/Project-HAMi/dcu-dcgm/blob/master/pkg/dcgm/include/rocm_smi.h.
322315
## Options:
323316
## - Mount /sys from the host to detect the correct devices' PCI info.
324-
## E.g. docker run --rm -it --privileged -v /var/run/docker.sock:/var/run/docker.sock gpustack/runtime:main gpustack-runtime detect --format json
317+
## - Mount /usr/share/hwdata and /usr/share/misc from the host to detect the correct device name.
318+
## E.g. docker run --rm -it --privileged -v /var/run/docker.sock:/var/run/docker.sock -v /opt/hyhal:/opt/hyhal -v /opt/dtk:/opt/dtk -e ROCM_PATH=/opt/dtk gpustack/runtime:main gpustack-runtime detect --format json
325319
ENV HYGON_VISIBLE_DEVICES="0" \
326320
GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/dtk;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
327321

0 commit comments

Comments
 (0)