Skip to content

Commit 08cd89c

Browse files
committed
refactor: enhance get topology
Signed-off-by: thxCode <[email protected]>
1 parent da78ea5 commit 08cd89c

File tree

8 files changed

+466
-89
lines changed

8 files changed

+466
-89
lines changed

gpustack_runtime/detector/amd.py

Lines changed: 15 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -269,10 +269,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
269269
if devices is None:
270270
return None
271271

272-
devices_count = len(devices)
273-
topology = Topology(
272+
ret = Topology(
274273
manufacturer=self.manufacturer,
275-
devices_count=devices_count,
274+
devices_count=len(devices),
276275
)
277276

278277
devs_mapping = None
@@ -319,42 +318,39 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
319318
for i, dev_i in enumerate(devices):
320319
# Get affinity with PCIe BDF if possible.
321320
if dev_i_bdf := dev_i.appendix.get("bdf", ""):
322-
topology.devices_numa_affinities[i] = get_numa_node_by_bdf(
321+
ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
323322
dev_i_bdf,
324323
)
325-
topology.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
326-
topology.devices_numa_affinities[i],
324+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
325+
ret.devices_numa_affinities[i],
327326
)
328327
# Otherwise, get affinity via AMD SMI.
329-
if not topology.devices_cpu_affinities[i]:
328+
if not ret.devices_cpu_affinities[i]:
330329
dev_i_handle = get_device_handle(dev_i)
331330

332331
# Get NUMA affinity.
333332
try:
334333
dev_i_numa_node = pyamdsmi.amdsmi_topo_get_numa_node_number(
335334
dev_i_handle,
336335
)
337-
topology.devices_numa_affinities[i] = str(dev_i_numa_node)
336+
ret.devices_numa_affinities[i] = str(dev_i_numa_node)
338337
except pyamdsmi.AmdSmiException:
339338
debug_log_exception(
340339
logger,
341340
"Failed to get NUMA affinity for device %d",
342341
dev_i.index,
343342
)
344343
# Get CPU affinity.
345-
topology.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
346-
topology.devices_numa_affinities[i],
344+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
345+
ret.devices_numa_affinities[i],
347346
)
348347

349348
# Get distances to other devices.
350349
for i, dev_i in enumerate(devices):
351350
dev_i_handle = get_device_handle(dev_i)
352351

353352
for j, dev_j in enumerate(devices):
354-
if (
355-
dev_i.index == dev_j.index
356-
or topology.devices_distances[i][j] != 0
357-
):
353+
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
358354
continue
359355

360356
dev_j_handle = get_device_handle(dev_j)
@@ -372,8 +368,8 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
372368
distance = TopologyDistanceEnum.SELF
373369
case pyamdsmi.AMDSMI_LINK_TYPE_PCIE:
374370
dev_i_numa, dev_j_numa = (
375-
topology.devices_numa_affinities[i],
376-
topology.devices_numa_affinities[j],
371+
ret.devices_numa_affinities[i],
372+
ret.devices_numa_affinities[j],
377373
)
378374
if dev_i_numa and dev_i_numa == dev_j_numa:
379375
distance = distance_pci_devices(
@@ -395,8 +391,8 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
395391
dev_j.index,
396392
)
397393

398-
topology.devices_distances[i][j] = distance
399-
topology.devices_distances[j][i] = distance
394+
ret.devices_distances[i][j] = distance
395+
ret.devices_distances[j][i] = distance
400396
except pyamdsmi.AmdSmiException:
401397
debug_log_exception(logger, "Failed to fetch topology")
402398
raise
@@ -406,7 +402,7 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
406402
finally:
407403
pyamdsmi.amdsmi_shut_down()
408404

409-
return topology
405+
return ret
410406

411407

412408
def _get_arch_family(dev_family_id: int | None) -> str:

gpustack_runtime/detector/ascend.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
logger = logging.getLogger(__name__)
2929
slogger = logger.getChild("internal")
3030

31-
_TOPO_TYPE_DISTANCE_MAPPING: dict[int, int] = {
31+
_TOPOLOGY_DISTANCE_MAPPING: dict[int, int] = {
3232
pydcmi.DCMI_TOPO_TYPE_SELF: TopologyDistanceEnum.SELF,
3333
pydcmi.DCMI_TOPO_TYPE_HCCS: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect, RoCE, etc.
3434
pydcmi.DCMI_TOPO_TYPE_PIX: TopologyDistanceEnum.PIX, # Traversing via a single PCIe bridge.
@@ -261,10 +261,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
261261
if devices is None:
262262
return None
263263

264-
devices_count = len(devices)
265-
topology = Topology(
264+
ret = Topology(
266265
manufacturer=self.manufacturer,
267-
devices_count=devices_count,
266+
devices_count=len(devices),
268267
)
269268

270269
try:
@@ -276,38 +275,35 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
276275

277276
# Get affinity with PCIe BDF if possible.
278277
if dev_i_bdf := dev_i.appendix.get("bdf", ""):
279-
topology.devices_numa_affinities[i] = get_numa_node_by_bdf(
278+
ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
280279
dev_i_bdf,
281280
)
282-
topology.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
283-
topology.devices_numa_affinities[i],
281+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
282+
ret.devices_numa_affinities[i],
284283
)
285284
# Otherwise, get affinity via DCMI.
286-
if not topology.devices_cpu_affinities[i]:
285+
if not ret.devices_cpu_affinities[i]:
287286
# Get CPU affinity.
288287
try:
289288
cpu_affinity = pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
290289
dev_i.appendix["card_id"],
291290
dev_i.appendix["device_id"],
292291
)
293-
topology.devices_cpu_affinities[i] = cpu_affinity
292+
ret.devices_cpu_affinities[i] = cpu_affinity
294293
except pydcmi.DCMIError:
295294
debug_log_exception(
296295
slogger,
297296
"Failed to get CPU affinity for device %d",
298297
dev_i.index,
299298
)
300299
# Get NUMA affinity.
301-
topology.devices_numa_affinities[i] = map_cpu_affinity_to_numa_node(
302-
topology.devices_cpu_affinities[i],
300+
ret.devices_numa_affinities[i] = map_cpu_affinity_to_numa_node(
301+
ret.devices_cpu_affinities[i],
303302
)
304303

305304
# Get distances to other devices.
306305
for j, dev_j in enumerate(devices):
307-
if (
308-
dev_i.index == dev_j.index
309-
or topology.devices_distances[i][j] != 0
310-
):
306+
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
311307
continue
312308

313309
dev_j_card_id = dev_j.appendix["card_id"]
@@ -320,13 +316,13 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
320316

321317
distance = TopologyDistanceEnum.UNK
322318
try:
323-
topo_type = pydcmi.dcmi_get_topo_info_by_device_id(
319+
topo = pydcmi.dcmi_get_topo_info_by_device_id(
324320
dev_i_card_id,
325321
dev_i_device_id,
326322
dev_j_card_id,
327323
dev_j_device_id,
328324
)
329-
distance = _TOPO_TYPE_DISTANCE_MAPPING.get(topo_type, distance)
325+
distance = _TOPOLOGY_DISTANCE_MAPPING.get(topo, distance)
330326
except pydcmi.DCMIError:
331327
debug_log_exception(
332328
slogger,
@@ -335,13 +331,13 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
335331
dev_j.index,
336332
)
337333

338-
topology.devices_distances[i][j] = distance
339-
topology.devices_distances[j][i] = distance
334+
ret.devices_distances[i][j] = distance
335+
ret.devices_distances[j][i] = distance
340336
except Exception:
341337
debug_log_exception(logger, "Failed to process topology fetching")
342338
raise
343339

344-
return topology
340+
return ret
345341

346342

347343
def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:

gpustack_runtime/detector/hygon.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
214214
if devices is None:
215215
return None
216216

217-
devices_count = len(devices)
218-
topology = Topology(
217+
ret = Topology(
219218
manufacturer=self.manufacturer,
220-
devices_count=devices_count,
219+
devices_count=len(devices),
221220
)
222221

223222
try:
@@ -253,38 +252,35 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
253252
for i, dev_i in enumerate(devices):
254253
# Get affinity with PCIe BDF if possible.
255254
if dev_i_bdf := dev_i.appendix.get("bdf", ""):
256-
topology.devices_numa_affinities[i] = get_numa_node_by_bdf(
255+
ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
257256
dev_i_bdf,
258257
)
259-
topology.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
260-
topology.devices_numa_affinities[i],
258+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
259+
ret.devices_numa_affinities[i],
261260
)
262261
# Otherwise, get affinity via ROCM SMI.
263-
if not topology.devices_numa_affinities[i]:
262+
if not ret.devices_numa_affinities[i]:
264263
# Get NUMA affinity.
265264
try:
266265
dev_i_numa_node = pyrocmsmi.rsmi_topo_get_numa_node_number(
267266
dev_i.index,
268267
)
269-
topology.devices_numa_affinities[i] = str(dev_i_numa_node)
268+
ret.devices_numa_affinities[i] = str(dev_i_numa_node)
270269
except pyrocmsmi.ROCMSMIError:
271270
debug_log_exception(
272271
logger,
273272
"Failed to get NUMA affinity for device %d",
274273
dev_i.index,
275274
)
276275
# Get CPU affinity.
277-
topology.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
278-
topology.devices_numa_affinities[i],
276+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
277+
ret.devices_numa_affinities[i],
279278
)
280279

281280
# Get distances to other devices.
282281
for i, dev_i in enumerate(devices):
283282
for j, dev_j in enumerate(devices):
284-
if (
285-
dev_i.index == dev_j.index
286-
or topology.devices_distances[i][j] != 0
287-
):
283+
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
288284
continue
289285

290286
distance = TopologyDistanceEnum.UNK
@@ -300,8 +296,8 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
300296
distance = TopologyDistanceEnum.LINK
301297
case pyrocmsmi.ROCMSMI_IOLINK_TYPE_PCIE:
302298
dev_i_numa, dev_j_numa = (
303-
topology.devices_numa_affinities[i],
304-
topology.devices_numa_affinities[j],
299+
ret.devices_numa_affinities[i],
300+
ret.devices_numa_affinities[j],
305301
)
306302
if dev_i_numa and dev_i_numa == dev_j_numa:
307303
distance = distance_pci_devices(
@@ -323,16 +319,16 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
323319
dev_j.index,
324320
)
325321

326-
topology.devices_distances[i][j] = distance
327-
topology.devices_distances[j][i] = distance
322+
ret.devices_distances[i][j] = distance
323+
ret.devices_distances[j][i] = distance
328324
except pyrocmsmi.ROCMSMIError:
329325
debug_log_exception(logger, "Failed to fetch topology")
330326
raise
331327
except Exception:
332328
debug_log_exception(logger, "Failed to process topology fetching")
333329
raise
334330

335-
return topology
331+
return ret
336332

337333

338334
def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:

0 commit comments

Comments
 (0)