Skip to content

Commit 98e3466

Browse files
committed
fix: failed on detection in amd
Signed-off-by: thxCode <[email protected]>
1 parent 17911b3 commit 98e3466

File tree

5 files changed

+65
-33
lines changed

5 files changed

+65
-33
lines changed

gpustack_runtime/detector/amd.py

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .__types__ import Detector, Device, Devices, ManufacturerEnum
1010
from .__utils__ import (
1111
PCIDevice,
12+
byte_to_mebibyte,
1213
get_brief_version,
1314
get_pci_devices,
1415
get_utilization,
@@ -141,20 +142,48 @@ def detect(self) -> Devices | None:
141142
elif dev_gpudev_info and hasattr(dev_gpudev_info, "cu_active_number"):
142143
dev_cores = dev_gpudev_info.cu_active_number
143144

144-
dev_gpu_metrics_info = pyamdsmi.amdsmi_get_gpu_metrics_info(dev)
145-
dev_cores_util = dev_gpu_metrics_info.get("average_gfx_activity", 0)
146-
dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
147-
dev_mem = dev_gpu_vram_usage.get("vram_total")
148-
dev_mem_used = dev_gpu_vram_usage.get("vram_used")
149-
dev_temp = dev_gpu_metrics_info.get("temperature_hotspot", 0)
150-
151-
dev_power_info = pyamdsmi.amdsmi_get_power_info(dev)
152-
dev_power = dev_power_info.get("power_limit", 0) // 1000000 # uW to W
153-
dev_power_used = (
154-
dev_power_info.get("current_socket_power")
155-
if dev_power_info.get("current_socket_power", "N/A") != "N/A"
156-
else dev_power_info.get("average_socket_power", 0)
157-
)
145+
dev_cores_util, dev_temp = None, None
146+
try:
147+
dev_gpu_metrics_info = pyamdsmi.amdsmi_get_gpu_metrics_info(dev)
148+
dev_cores_util = dev_gpu_metrics_info.get("average_gfx_activity", 0)
149+
dev_temp = dev_gpu_metrics_info.get("temperature_hotspot", 0)
150+
except pyamdsmi.AmdSmiException:
151+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
152+
pyrocmsmi.rsmi_init()
153+
dev_cores_util = pyrocmsmi.rsmi_dev_busy_percent_get(dev_idx)
154+
dev_temp = pyrocmsmi.rsmi_dev_temp_metric_get(dev_idx)
155+
156+
dev_mem, dev_mem_used = None, None
157+
try:
158+
dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
159+
dev_mem = dev_gpu_vram_usage.get("vram_total")
160+
dev_mem_used = dev_gpu_vram_usage.get("vram_used")
161+
except pyamdsmi.AmdSmiException:
162+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
163+
pyrocmsmi.rsmi_init()
164+
dev_mem = byte_to_mebibyte( # byte to MiB
165+
pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
166+
)
167+
dev_mem_used = byte_to_mebibyte( # byte to MiB
168+
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
169+
)
170+
171+
dev_power, dev_power_used = None, None
172+
try:
173+
dev_power_info = pyamdsmi.amdsmi_get_power_info(dev)
174+
dev_power = (
175+
dev_power_info.get("power_limit", 0) // 1000000
176+
) # uW to W
177+
dev_power_used = (
178+
dev_power_info.get("current_socket_power")
179+
if dev_power_info.get("current_socket_power", "N/A") != "N/A"
180+
else dev_power_info.get("average_socket_power", 0)
181+
)
182+
except pyamdsmi.AmdSmiException:
183+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
184+
pyrocmsmi.rsmi_init()
185+
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
186+
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
158187

159188
dev_compute_partition = None
160189
with contextlib.suppress(pyamdsmi.AmdSmiException):

gpustack_runtime/detector/pyamdgpu/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def _LoadAMDGPULibrary():
221221
# Linux path
222222
locs = [
223223
"libdrm_amdgpu.so.1.0.0",
224+
"libdrm_amdgpu.so.1",
224225
"libdrm_amdgpu.so",
225226
]
226227
for loc in locs:

pack/Dockerfile

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# - GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS: Semicolon-separated list of labels to filter mirrored images when deploying mirrored deployment.
1919
ARG PYTHON_VERSION=3.11
2020
ARG GPUSTACK_RUNTIME_BASE_IMAGE=runtime
21-
ARG GPUSTACK_RUNTIME_ROCM_VERSION=6.2.4
21+
ARG GPUSTACK_RUNTIME_ROCM_VERSION=6.4.4
2222
ARG GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS
2323

2424
FROM ubuntu:22.04@sha256:3c61d3759c2639d4b836d32a2d3c83fa0214e36f195a3421018dbaaf79cbe37f AS runtime
@@ -346,6 +346,21 @@ RUN --mount=type=bind,from=rocm-base,source=/opt/rocm/share,target=/opt/rocm/sha
346346
uv pip install --no-build-isolation \
347347
/opt/rocm/share/amd_smi
348348
uv pip tree
349+
350+
# Hack: create an unversioned libdrm_amdgpu.so symlink pointing at libdrm_amdgpu.so.1.0.0 to avoid the error "Fail to open libdrm_amdgpu.so: libdrm_amdgpu.so: cannot open shared object file: No such file or directory"
351+
TARGET_DIR="/usr/lib/$(uname -m)-linux-gnu"
352+
TARGET_LIB="libdrm_amdgpu.so.1.0.0"
353+
TARGET_LINK="libdrm_amdgpu.so"
354+
TARGET_LIB_EXISTED="true"
355+
if [[ ! -e "${TARGET_DIR}/${TARGET_LIB}" ]]; then
356+
TARGET_LIB_EXISTED="false"
357+
touch "${TARGET_DIR}/${TARGET_LIB}"
358+
fi
359+
pushd "${TARGET_DIR}" \
360+
&& ln -sf "${TARGET_LIB}" "${TARGET_LINK}"
361+
if [[ "${TARGET_LIB_EXISTED}" == "false" ]]; then
362+
rm -f "${TARGET_DIR}/${TARGET_LIB}"
363+
fi
349364
EOF
350365
ENV AMD_VISIBLE_DEVICES="all" \
351366
GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/rocm"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,5 @@ dev=[
4646
"mkdocs-material>=9.6.18",
4747
"pymdown-extensions>=10.16.1",
4848
"dockerpty>=0.4.1",
49-
"amdsmi==6.2.4",
49+
"amdsmi>=6.4.4",
5050
]

uv.lock

Lines changed: 4 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)