Skip to content

Commit 972c196

Browse files
committed
fix: failed on amd mixed nvidia
Signed-off-by: thxCode <[email protected]>
1 parent 1c80322 commit 972c196

File tree

5 files changed: 22 additions and 23 deletions (+22 -23 lines changed)

gpustack_runtime/detector/amd.py

Lines changed: 10 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2,17 +2,14 @@
22

33
import contextlib
44
import logging
5-
import os
65
from functools import lru_cache
7-
from pathlib import Path
86

97
from .. import envs
108
from . import pyamdgpu, pyamdsmi, pyrocmsmi
119
from .__types__ import Detector, Device, Devices, ManufacturerEnum
1210
from .__utils__ import (
1311
PCIDevice,
1412
get_brief_version,
15-
get_device_files,
1613
get_pci_devices,
1714
get_utilization,
1815
)
@@ -91,24 +88,21 @@ def detect(self) -> Devices | None:
9188
sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
9289

9390
devs = pyamdsmi.amdsmi_get_processor_handles()
94-
dev_files = get_device_files(
95-
pattern=r"card(?P<number>\d+)",
96-
directory="/dev/dri",
97-
)
9891
for dev_idx, dev in enumerate(devs):
99-
dev_card = None
10092
dev_index = dev_idx
101-
if len(dev_files) >= len(devs):
102-
dev_file = dev_files[dev_idx]
103-
if dev_file.number is not None:
104-
dev_card = dev_file.number
93+
94+
dev_card = None
95+
if hasattr(pyamdsmi, "amdsmi_get_gpu_kfd_info"):
96+
dev_kfd_info = pyamdsmi.amdsmi_get_gpu_kfd_info(dev)
97+
dev_card = dev_kfd_info.get("node_id")
98+
else:
99+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
100+
pyrocmsmi.rsmi_init()
101+
dev_card = pyrocmsmi.rsmi_dev_node_id_get(dev_idx)
105102

106103
dev_gpudev_info = None
107104
if dev_card is not None:
108-
with (
109-
contextlib.redirect_stderr(Path(os.devnull).open("w")),
110-
contextlib.suppress(pyamdgpu.AMDGPUError),
111-
):
105+
with contextlib.suppress(pyamdgpu.AMDGPUError):
112106
_, _, dev_gpudev = pyamdgpu.amdgpu_device_initialize(
113107
dev_card,
114108
)

gpustack_runtime/detector/pyamdgpu/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -254,16 +254,12 @@ def amdgpu_device_initialize(card=1):
254254

255255

256256
def amdgpu_device_deinitialize(device):
257-
_LoadAMDGPULibrary()
258-
259257
fn = _amdgpuGetFunctionPointer("amdgpu_device_deinitialize")
260258
ret = fn(device)
261259
_amdgpuCheckReturn(ret)
262260

263261

264262
def amdgpu_query_gpu_info(device):
265-
_LoadAMDGPULibrary()
266-
267263
c_info = c_amdgpu_gpu_info()
268264
fn = _amdgpuGetFunctionPointer("amdgpu_query_gpu_info")
269265
ret = fn(device, byref(c_info))

gpustack_runtime/detector/pyrocmsmi/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -239,3 +239,13 @@ def rsmi_dev_power_get(device=0):
239239
ret = rocmsmiLib.rsmi_dev_power_get(device, byref(c_power), byref(c_power_type))
240240
_rocmsmiCheckReturn(ret)
241241
return c_power.value // 1000000
242+
243+
244+
def rsmi_dev_node_id_get(device=0):
245+
if not rocmsmiLib:
246+
raise ROCMSMIError(ROCMSMI_ERROR_UNINITIALIZED)
247+
248+
c_node_id = c_uint32()
249+
ret = rocmsmiLib.rsmi_dev_node_id_get(device, byref(c_node_id))
250+
_rocmsmiCheckReturn(ret)
251+
return c_node_id.value

gpustack_runtime/envs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -209,7 +209,7 @@
209209
"cambricon.com/devices=CAMBRICON_VISIBLE_DEVICES;"
210210
"hygon.com/devices=HYGON_VISIBLE_DEVICES;"
211211
"iluvatar.ai/devices=ILUVATAR_VISIBLE_DEVICES;"
212-
"metax-tech.com/devices=METAX_VISIBLE_DEVICES;"
212+
"metax-tech.com/devices=CUDA_VISIBLE_DEVICES;"
213213
"mthreads.com/devices=METHERDS_VISIBLE_DEVICES;"
214214
"nvidia.com/devices=NVIDIA_VISIBLE_DEVICES;",
215215
),

pack/Dockerfile

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -324,8 +324,7 @@ ENV ILUVATAR_VISIBLE_DEVICES="0"
324324
## - Mount /opt/maca from the host to support device detecting.
325325
## - Mount /opt/mxdriver/ from the host to support device detecting.
326326
## E.g. docker run --rm -it --privileged -v /opt/mxdriver:/opt/mxdriver -v /opt/maca:/opt/maca gpustack/runtime:main gpustack-runtime detect --format json
327-
ENV METAX_VISIBLE_DEVICES="0" \
328-
LD_LIBRARY_PATH="/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:${LD_LIBRARY_PATH}"
327+
ENV LD_LIBRARY_PATH="/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:${LD_LIBRARY_PATH}"
329328

330329
## Active all MThread devices detection,
331330
## works with (default) MThread container runtime and privileged mode.

0 commit comments

Comments
 (0)