Skip to content

Commit da78ea5

Browse files
committed
refactor: enhance get topology
Signed-off-by: thxCode <[email protected]>
1 parent 0a869e0 commit da78ea5

File tree

7 files changed

+288
-111
lines changed

7 files changed

+288
-111
lines changed

gpustack_runtime/detector/amd.py

Lines changed: 82 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import contextlib
44
import logging
55
from functools import lru_cache
6+
from pathlib import Path
67

78
from .. import envs
89
from ..logging import debug_log_exception, debug_log_warning
@@ -102,47 +103,47 @@ def detect(self) -> Devices | None:
102103
dev_index = dev_idx
103104

104105
dev_gpu_asic_info = pyamdsmi.amdsmi_get_gpu_asic_info(dev)
105-
dev_uuid = f"GPU-{(dev_gpu_asic_info.get('asic_serial')[2:]).lower()}"
106+
if dev_gpu_asic_info.get("asic_serial") != "N/A":
107+
asic_serial = dev_gpu_asic_info.get("asic_serial")
108+
dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
109+
else:
110+
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
106111
dev_hsa_agent = hsa_agents.get(dev_uuid)
107112

108-
dev_card_id = None
109-
if dev_hsa_agent:
110-
dev_card_id = dev_hsa_agent.driver_node_id
111-
elif hasattr(pyamdsmi, "amdsmi_get_gpu_kfd_info"):
112-
dev_kfd_info = pyamdsmi.amdsmi_get_gpu_kfd_info(dev)
113-
dev_card_id = dev_kfd_info.get("node_id")
114-
else:
113+
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
114+
dev_driver_ver = dev_gpu_driver_info.get("driver_version")
115+
116+
dev_name = dev_hsa_agent.name
117+
if not dev_name:
118+
dev_name = dev_gpu_asic_info.get("market_name")
119+
120+
dev_cc = dev_hsa_agent.compute_capability
121+
if not dev_cc:
115122
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
116-
dev_card_id = pyrocmsmi.rsmi_dev_node_id_get(dev_idx)
123+
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
117124

118-
dev_gpudev_info = None
119-
if dev_card_id is not None:
125+
dev_bdf = None
126+
dev_card_id = None
127+
dev_renderd_id = None
128+
with contextlib.suppress(pyamdsmi.AmdSmiException):
129+
dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
130+
dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
131+
132+
dev_cores = dev_hsa_agent.compute_units
133+
dev_asic_family_id = dev_hsa_agent.asic_family_id
134+
if (
135+
not dev_cores or not dev_asic_family_id
136+
) and dev_card_id is not None:
120137
with contextlib.suppress(pyamdgpu.AMDGPUError):
121138
_, _, dev_gpudev = pyamdgpu.amdgpu_device_initialize(
122139
dev_card_id,
123140
)
124141
dev_gpudev_info = pyamdgpu.amdgpu_query_gpu_info(dev_gpudev)
125142
pyamdgpu.amdgpu_device_deinitialize(dev_gpudev)
126-
127-
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
128-
dev_driver_ver = dev_gpu_driver_info.get("driver_version")
129-
130-
if dev_hsa_agent:
131-
dev_name = dev_hsa_agent.name
132-
if not dev_name:
133-
dev_name = dev_gpu_asic_info.get("market_name")
134-
dev_cc = dev_hsa_agent.compute_capability
135-
else:
136-
dev_name = dev_gpu_asic_info.get("market_name")
137-
dev_cc = None
138-
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
139-
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
140-
141-
dev_cores = None
142-
if dev_hsa_agent:
143-
dev_cores = dev_hsa_agent.compute_units
144-
elif dev_gpudev_info and hasattr(dev_gpudev_info, "cu_active_number"):
145-
dev_cores = dev_gpudev_info.cu_active_number
143+
if not dev_cores:
144+
dev_cores = dev_gpudev_info.cu_active_number
145+
if not dev_asic_family_id:
146+
dev_asic_family_id = dev_gpudev_info.family_id
146147

147148
dev_cores_util = None
148149
dev_temp = None
@@ -201,24 +202,22 @@ def detect(self) -> Devices | None:
201202
)
202203

203204
dev_appendix = {
204-
"arch_family": _get_arch_family(dev_gpudev_info),
205+
"arch_family": _get_arch_family(dev_asic_family_id),
205206
"vgpu": dev_compute_partition is not None,
206-
"card_id": dev_card_id,
207207
}
208+
if dev_bdf:
209+
dev_appendix["bdf"] = dev_bdf
210+
if dev_card_id is not None:
211+
dev_appendix["card_id"] = dev_card_id
212+
if dev_renderd_id is not None:
213+
dev_appendix["renderd_id"] = dev_renderd_id
208214

209215
with contextlib.suppress(pyamdsmi.AmdSmiException):
210216
dev_xgmi = pyamdsmi.amdsmi_get_xgmi_info(dev)
211-
for k in [
212-
"xgmi_lanes",
213-
"xgmi_hive_id",
214-
"xgmi_node_id",
215-
]:
216-
if value := dev_xgmi.get(k):
217-
dev_appendix[k] = value
218-
219-
with contextlib.suppress(pyamdsmi.AmdSmiException):
220-
dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
221-
dev_appendix["bdf"] = dev_bdf
217+
if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
218+
dev_appendix["xgmi_lanes"] = xgmi_lanes
219+
dev_appendix["xgmi_hive_id"] = dev_xgmi.get("xgmi_hive_id")
220+
dev_appendix["xgmi_node_id"] = dev_xgmi.get("xgmi_node_id")
222221

223222
ret.append(
224223
Device(
@@ -410,15 +409,20 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
410409
return topology
411410

412411

413-
def _get_arch_family(
414-
dev_gpudev_info: pyamdgpu.c_amdgpu_gpu_info | None,
415-
) -> str | None:
416-
if not dev_gpudev_info:
417-
return None
412+
def _get_arch_family(dev_family_id: int | None) -> str:
413+
"""
414+
Get the architecture family name from the device family ID.
415+
416+
Args:
417+
dev_family_id:
418+
The device family ID.
419+
420+
Returns:
421+
The architecture family as string.
418422
419-
family_id = dev_gpudev_info.family_id
420-
if family_id is None:
421-
return None
423+
"""
424+
if dev_family_id is None:
425+
return "Unknown"
422426

423427
arch_family = {
424428
pyamdgpu.AMDGPU_FAMILY_SI: "Southern Islands",
@@ -439,4 +443,29 @@ def _get_arch_family(
439443
pyamdgpu.AMDGPU_FAMILY_GC_12_0_0: "GC 12.0.0",
440444
}
441445

442-
return arch_family.get(family_id, "Unknown")
446+
return arch_family.get(dev_family_id, "Unknown")
447+
448+
449+
def _get_card_and_renderd_id(
    dev_bdf: str,
    *,
    sysfs_module_root: str | Path = "/sys/module",
) -> tuple[int | None, int | None]:
    """
    Get the card ID and renderD ID for a given device bdf.

    The IDs are read from the names of the entries found under
    ``<sysfs_module_root>/amdgpu/drivers/pci:amdgpu/<dev_bdf>/drm``:
    an entry named ``card<N>`` yields the card ID and an entry named
    ``renderD<N>`` yields the renderD ID.

    This lookup is best-effort and never raises for filesystem problems:
    a missing or unreadable directory, or an entry whose suffix is not an
    integer, simply leaves the corresponding ID as None.

    Args:
        dev_bdf:
            The device bdf.
        sysfs_module_root:
            Directory that holds the kernel module entries.
            Defaults to "/sys/module"; override only for testing.

    Returns:
        A tuple of (card_id, renderd_id), each None when not found.

    """
    card_id = None
    renderd_id = None

    drm_path = Path(sysfs_module_root) / f"amdgpu/drivers/pci:amdgpu/{dev_bdf}/drm"
    try:
        # Iterate directly instead of checking exists() first: the directory
        # can vanish or be unreadable between the check and the listing.
        entry_names = [entry.name for entry in drm_path.iterdir()]
    except OSError:
        return card_id, renderd_id

    for entry_name in entry_names:
        # Skip entries whose suffix is not an integer rather than failing
        # the whole device detection on an unexpected name.
        if entry_name.startswith("card"):
            with contextlib.suppress(ValueError):
                card_id = int(entry_name[4:])
        elif entry_name.startswith("renderD"):
            with contextlib.suppress(ValueError):
                renderd_id = int(entry_name[7:])

    return card_id, renderd_id

gpustack_runtime/detector/hygon.py

Lines changed: 69 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from .. import envs
99
from ..logging import debug_log_exception, debug_log_warning
10-
from . import Topology, pyhsa, pyrocmcore, pyrocmsmi
10+
from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
1111
from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
1212
from .__utils__ import (
1313
PCIDevice,
@@ -91,13 +91,14 @@ def detect(self) -> Devices | None:
9191
pyrocmsmi.rsmi_init()
9292

9393
sys_driver_ver = None
94-
sys_driver_ver_path = Path("/sys/module/hydcu/version")
95-
if sys_driver_ver_path.exists():
96-
try:
97-
with sys_driver_ver_path.open(encoding="utf-8") as f:
98-
sys_driver_ver = f.read().strip()
99-
except OSError:
100-
pass
94+
for path in [
95+
Path("/sys/module/hycu/version"),
96+
Path("/sys/module/hydcu/version"),
97+
]:
98+
if path.exists():
99+
with contextlib.suppress(Exception):
100+
sys_driver_ver = path.read_text().strip()
101+
break
101102

102103
sys_runtime_ver_original = pyrocmcore.getROCmVersion()
103104
sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
@@ -109,18 +110,34 @@ def detect(self) -> Devices | None:
109110
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
110111
dev_hsa_agent = hsa_agents.get(dev_uuid)
111112

112-
if dev_hsa_agent:
113-
dev_name = dev_hsa_agent.name
114-
if not dev_name:
115-
dev_name = pyrocmsmi.rsmi_dev_name_get(dev_idx)
116-
dev_cc = dev_hsa_agent.compute_capability
117-
dev_cores = dev_hsa_agent.compute_units
118-
else:
113+
dev_name = dev_hsa_agent.name
114+
if not dev_name:
119115
dev_name = pyrocmsmi.rsmi_dev_name_get(dev_idx)
120-
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
121-
dev_cores = None
116+
117+
dev_cc = dev_hsa_agent.compute_capability
118+
if not dev_cc:
119+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
120+
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
121+
122+
dev_bdf = None
123+
dev_card_id = None
124+
dev_renderd_id = None
125+
with contextlib.suppress(Exception):
126+
dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
127+
dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
128+
129+
dev_cores = dev_hsa_agent.compute_units
130+
if not dev_cores and dev_card_id is not None:
131+
with contextlib.suppress(pyamdgpu.AMDGPUError):
132+
_, _, dev_gpudev = pyamdgpu.amdgpu_device_initialize(
133+
dev_card_id,
134+
)
135+
dev_gpudev_info = pyamdgpu.amdgpu_query_gpu_info(dev_gpudev)
136+
pyamdgpu.amdgpu_device_deinitialize(dev_gpudev)
137+
dev_cores = dev_gpudev_info.cu_active_number
122138

123139
dev_cores_util = pyrocmsmi.rsmi_dev_busy_percent_get(dev_idx)
140+
dev_temp = pyrocmsmi.rsmi_dev_temp_metric_get(dev_idx)
124141
if dev_cores_util is None:
125142
debug_log_warning(
126143
logger,
@@ -136,18 +153,18 @@ def detect(self) -> Devices | None:
136153
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
137154
)
138155

139-
dev_temp = pyrocmsmi.rsmi_dev_temp_metric_get(dev_idx)
140-
141156
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
142157
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
143158

144159
dev_appendix = {
145160
"vgpu": False,
146161
}
147-
148-
with contextlib.suppress(Exception):
149-
dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
162+
if dev_bdf is not None:
150163
dev_appendix["bdf"] = dev_bdf
164+
if dev_card_id is not None:
165+
dev_appendix["card_id"] = dev_card_id
166+
if dev_renderd_id is not None:
167+
dev_appendix["renderd_id"] = dev_renderd_id
151168

152169
ret.append(
153170
Device(
@@ -316,3 +333,33 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
316333
raise
317334

318335
return topology
336+
337+
338+
def _get_card_and_renderd_id(
    dev_bdf: str,
    *,
    sysfs_module_root: str | Path = "/sys/module",
) -> tuple[int | None, int | None]:
    """
    Get the card ID and renderD ID for a given device bdf.

    Two driver module names are probed in order, "hycu" then "hydcu".
    For each, the entries under
    ``<sysfs_module_root>/<driver>/drivers/pci:<driver>/<dev_bdf>/drm``
    are listed: an entry named ``card<N>`` yields the card ID and an entry
    named ``renderD<N>`` yields the renderD ID. Only the first directory
    that can be listed is used.

    This lookup is best-effort and never raises for filesystem problems:
    a missing or unreadable directory is skipped, and an entry whose suffix
    is not an integer leaves the corresponding ID untouched.

    Args:
        dev_bdf:
            The device bdf.
        sysfs_module_root:
            Directory that holds the kernel module entries.
            Defaults to "/sys/module"; override only for testing.

    Returns:
        A tuple of (card_id, renderd_id), each None when not found.

    """
    card_id = None
    renderd_id = None

    for driver in ("hycu", "hydcu"):
        drm_path = Path(sysfs_module_root) / f"{driver}/drivers/pci:{driver}/{dev_bdf}/drm"
        try:
            # Iterate directly instead of checking exists() first: the
            # directory can vanish or be unreadable between the two calls.
            entry_names = [entry.name for entry in drm_path.iterdir()]
        except OSError:
            continue

        for entry_name in entry_names:
            # Skip entries whose suffix is not an integer rather than
            # aborting the lookup on an unexpected name.
            if entry_name.startswith("card"):
                with contextlib.suppress(ValueError):
                    card_id = int(entry_name[4:])
            elif entry_name.startswith("renderD"):
                with contextlib.suppress(ValueError):
                    renderd_id = int(entry_name[7:])
        break

    return card_id, renderd_id

gpustack_runtime/detector/pyhsa/__init__.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
##
44
from __future__ import annotations
55

6+
import contextlib
67
import os
78
import string
89
import sys
@@ -437,6 +438,12 @@ def hsa_agent_get_info_driver_node_id(agent):
437438
return c_driver_node_id.value
438439

439440

441+
def has_agent_get_asic_family_id(agent):
    """
    Query the ASIC family ID attribute of an HSA agent.

    Args:
        agent:
            The agent handle passed through to ``hsa_agent_get_info``.

    Returns:
        The value written into a 32-bit unsigned integer by the
        ``HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID`` query.

    """
    # NOTE(review): the "has_" prefix looks like a typo of "hsa_" (compare
    # hsa_agent_get_info_driver_node_id); the name is kept because callers
    # in this module reference it.
    family_id_out = c_uint32()
    hsa_agent_get_info(
        agent,
        HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID,
        byref(family_id_out),
    )
    return family_id_out.value
445+
446+
440447
@dataclass
441448
class Agent:
442449
device_type: int
@@ -445,7 +452,7 @@ class Agent:
445452
name: str
446453
compute_capability: str
447454
compute_units: int
448-
driver_node_id: int
455+
asic_family_id: int | None
449456

450457

451458
def get_agents() -> list[Agent]:
@@ -464,7 +471,9 @@ def has_agent_callback(agent) -> int:
464471
agent_name = hsa_agent_get_info_product_name(agent)
465472
agent_compute_capability = hsa_agent_get_info_name(agent)
466473
agent_compute_units = hsa_agent_get_info_compute_unit_count(agent)
467-
agent_driver_node_id = hsa_agent_get_info_driver_node_id(agent)
474+
agent_asic_family_id = None
475+
with contextlib.suppress(HSAError):
476+
agent_asic_family_id = has_agent_get_asic_family_id(agent)
468477

469478
agents.append(
470479
Agent(
@@ -474,7 +483,7 @@ def has_agent_callback(agent) -> int:
474483
name=agent_name,
475484
compute_capability=agent_compute_capability,
476485
compute_units=agent_compute_units,
477-
driver_node_id=agent_driver_node_id,
486+
asic_family_id=agent_asic_family_id,
478487
)
479488
)
480489

gpustack_runtime/detector/pyrocmsmi/__init__.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -324,15 +324,17 @@ def rsmi_dev_pci_id_get(device=0):
324324
ret = rocmsmiLib.rsmi_dev_pci_id_get(device, byref(c_pci_id))
325325
_rocmsmiCheckReturn(ret)
326326

327-
v_pci_id = c_pci_id.value
327+
return str_bdfid(c_pci_id.value)
328+
329+
330+
def str_bdfid(bdfid: int) -> str:
    """
    Format a packed BDFID integer as a "domain:bus:device.function" string.

    The packed layout is:

        BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((Partition & 0xF) << 28)
                | ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) << 3)
                | (FUNCTION & 0x7)

    The partition bits are not included in the result.

    Args:
        bdfid:
            The packed BDFID value.

    Returns:
        The lowercase hexadecimal string, e.g. "0000:03:00.0".

    """

    def _bits(shift: int, mask: int) -> int:
        # Extract one field: drop the lower bits, then keep the field width.
        return (bdfid >> shift) & mask

    function = _bits(0, 0x7)
    device_id = _bits(3, 0x1F)
    bus = _bits(8, 0xFF)
    domain = _bits(32, 0xFFFFFFFF)
    return f"{domain:04x}:{bus:02x}:{device_id:02x}.{function:x}"
337339

338340

0 commit comments

Comments
 (0)