Skip to content

Commit 4d61630

Browse files
committed
refactor: get topology
with test modification

Signed-off-by: thxCode <[email protected]>
1 parent 21162d2 commit 4d61630

32 files changed

+611
-415
lines changed

gpustack_runtime/cmds/detector.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,7 @@ def run(self):
128128
" PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)",
129129
" NODE = Connection traversing PCIe and the interconnect between NUMA nodes",
130130
" SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)",
131+
" N/A = Unknown or unavailable information",
131132
]
132133
print(os.linesep.join(legend_lines))
133134

gpustack_runtime/detector/__types__.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -303,9 +303,9 @@ class TopologyDistanceEnum(int, Enum):
303303
Enum for Topology Distance Levels.
304304
"""
305305

306-
INTERNAL = 0
306+
SELF = 0
307307
"""
308-
Same device.
308+
Self connection.
309309
"""
310310
LINK = 5
311311
"""
@@ -331,7 +331,7 @@ class TopologyDistanceEnum(int, Enum):
331331
"""
332332
Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI).
333333
"""
334-
UNKNOWN = 100
334+
UNK = 100
335335
"""
336336
Unknown connection.
337337
"""
@@ -350,19 +350,19 @@ def stringify_devices_distance(distance: int) -> str:
350350
351351
"""
352352
match distance:
353-
case 0:
353+
case TopologyDistanceEnum.SELF:
354354
return "X"
355-
case 5:
355+
case TopologyDistanceEnum.LINK:
356356
return "LINK"
357-
case 10:
357+
case TopologyDistanceEnum.PIX:
358358
return "PIX"
359-
case 20:
359+
case TopologyDistanceEnum.PXB:
360360
return "PXB"
361-
case 30:
361+
case TopologyDistanceEnum.PHB:
362362
return "PHB"
363-
case 40:
363+
case TopologyDistanceEnum.NODE:
364364
return "NODE"
365-
case 50:
365+
case TopologyDistanceEnum.SYS:
366366
return "SYS"
367367
case _:
368368
return "N/A"
@@ -450,7 +450,7 @@ def detect(self) -> Devices | None:
450450
"""
451451
raise NotImplementedError
452452

453-
def get_topology(self, devices: Devices | None) -> Topology | None: # noqa: ARG002
453+
def get_topology(self, devices: Devices | None = None) -> Topology | None: # noqa: ARG002
454454
"""
455455
Get the Topology object between the given devices.
456456

gpustack_runtime/detector/amd.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -250,7 +250,7 @@ def detect(self) -> Devices | None:
250250

251251
return ret
252252

253-
def get_topology(self, devices: Devices | None) -> Topology | None:
253+
def get_topology(self, devices: Devices | None = None) -> Topology | None:
254254
"""
255255
Get the Topology object between AMD GPUs.
256256
@@ -320,15 +320,15 @@ def get_device_handle(dev: Device):
320320

321321
dev_j_handle = get_device_handle(dev_j)
322322

323-
distance = TopologyDistanceEnum.UNKNOWN
323+
distance = TopologyDistanceEnum.UNK
324324
try:
325325
link_type = pyamdsmi.amdsmi_topo_get_link_type(
326326
dev_i_handle,
327327
dev_j_handle,
328328
)
329329
match int(link_type.type):
330330
case pyamdsmi.AMDSMI_LINK_TYPE_INTERNAL:
331-
distance = TopologyDistanceEnum.INTERNAL
331+
distance = TopologyDistanceEnum.SELF
332332
case pyamdsmi.AMDSMI_LINK_TYPE_XGMI:
333333
distance = TopologyDistanceEnum.LINK
334334
# For PCIe links,

gpustack_runtime/detector/ascend.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@
2727
slogger = logger.getChild("internal")
2828

2929
_TOPO_TYPE_DISTANCE_MAPPING: dict[int, int] = {
30-
pydcmi.DCMI_TOPO_TYPE_SELF: TopologyDistanceEnum.INTERNAL,
30+
pydcmi.DCMI_TOPO_TYPE_SELF: TopologyDistanceEnum.SELF,
3131
pydcmi.DCMI_TOPO_TYPE_HCCS: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect, RoCE, etc.
3232
pydcmi.DCMI_TOPO_TYPE_PIX: TopologyDistanceEnum.PIX, # Traversing via a single PCIe bridge.
3333
pydcmi.DCMI_TOPO_TYPE_PXB: TopologyDistanceEnum.PXB, # Traversing via multiple PCIe bridges without PCIe Host Bridge.
@@ -111,6 +111,9 @@ def detect(self) -> Devices | None:
111111

112112
_, card_list = pydcmi.dcmi_get_card_list()
113113
for dev_card_id in card_list:
114+
device_id_max_in_card, _, _ = pydcmi.dcmi_get_device_id_in_card(
115+
dev_card_id,
116+
)
114117
device_num_in_card = pydcmi.dcmi_get_device_num_in_card(dev_card_id)
115118
for dev_device_id in range(device_num_in_card):
116119
dev_is_vgpu = False
@@ -190,6 +193,7 @@ def detect(self) -> Devices | None:
190193
"vgpu": dev_is_vgpu,
191194
"card_id": dev_card_id,
192195
"device_id": dev_device_id,
196+
"device_id_max": device_id_max_in_card - 1,
193197
}
194198

195199
dev_roce_ip, dev_roce_mask, dev_roce_gateway = (
@@ -233,7 +237,7 @@ def detect(self) -> Devices | None:
233237

234238
return ret
235239

236-
def get_topology(self, devices: Devices | None) -> Topology | None:
240+
def get_topology(self, devices: Devices | None = None) -> Topology | None:
237241
"""
238242
Get the Topology object between Ascend NPUs.
239243
@@ -258,6 +262,8 @@ def get_topology(self, devices: Devices | None) -> Topology | None:
258262
)
259263

260264
try:
265+
pydcmi.dcmi_init()
266+
261267
for i, dev_i in enumerate(devices):
262268
# Get CPU affinity.
263269
try:
@@ -285,7 +291,7 @@ def get_topology(self, devices: Devices | None) -> Topology | None:
285291
if topology.devices_distances[i][j] != 0:
286292
continue
287293

288-
distance = TopologyDistanceEnum.UNKNOWN
294+
distance = TopologyDistanceEnum.UNK
289295
try:
290296
topo_type = pydcmi.dcmi_get_topo_info_by_device_id(
291297
dev_i.appendix["card_id"],

gpustack_runtime/detector/nvidia.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -393,7 +393,7 @@ def detect(self) -> Devices | None: # noqa: PLR0915
393393

394394
return ret
395395

396-
def get_topology(self, devices: Devices | None) -> Topology | None:
396+
def get_topology(self, devices: Devices | None = None) -> Topology | None:
397397
"""
398398
Get the Topology object between NVIDIA GPUs.
399399
@@ -485,7 +485,7 @@ def get_topology(self, devices: Devices | None) -> Topology | None:
485485

486486
dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
487487

488-
distance = TopologyDistanceEnum.UNKNOWN
488+
distance = TopologyDistanceEnum.UNK
489489
try:
490490
distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
491491
dev_i_handle,

tests/gpustack_runtime/detector/fixtures/output_amd_rx7800xt.json

Lines changed: 0 additions & 24 deletions
This file was deleted.

tests/gpustack_runtime/detector/fixtures/output_ascend_310p3.json

Lines changed: 0 additions & 98 deletions
This file was deleted.

tests/gpustack_runtime/detector/fixtures/output_ascend_910b1.json

Lines changed: 0 additions & 56 deletions
This file was deleted.

0 commit comments

Comments
 (0)