Skip to content

Commit c9ba6ee

Browse files
committed
feat: get topology
Signed-off-by: thxCode <[email protected]>
1 parent a9796bc commit c9ba6ee

File tree

10 files changed

+1033
-28
lines changed

10 files changed

+1033
-28
lines changed

gpustack_runtime/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
DeleteWorkloadSubCommand,
1919
DetectDevicesSubCommand,
2020
ExecWorkloadSubCommand,
21+
GetDevicesTopologySubCommand,
2122
GetWorkloadSubCommand,
2223
InspectWorkloadSubCommand,
2324
ListImagesSubCommand,
@@ -68,6 +69,7 @@ def main():
6869
ExecWorkloadSubCommand.register(subcommand_parser)
6970
InspectWorkloadSubCommand.register(subcommand_parser)
7071
DetectDevicesSubCommand.register(subcommand_parser)
72+
GetDevicesTopologySubCommand.register(subcommand_parser)
7173
ListImagesSubCommand.register(subcommand_parser)
7274
SaveImagesSubCommand.register(subcommand_parser)
7375
CopyImagesSubCommand.register(subcommand_parser)

gpustack_runtime/cmds/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
ListWorkloadsSubCommand,
1212
LogsWorkloadSubCommand,
1313
)
14-
from .detector import DetectDevicesSubCommand
14+
from .detector import DetectDevicesSubCommand, GetDevicesTopologySubCommand
1515
from .images import (
1616
CopyImagesSubCommand,
1717
ListImagesSubCommand,
@@ -29,6 +29,7 @@
2929
"DeleteWorkloadsSubCommand",
3030
"DetectDevicesSubCommand",
3131
"ExecWorkloadSubCommand",
32+
"GetDevicesTopologySubCommand",
3233
"GetWorkloadSubCommand",
3334
"InspectWorkloadSubCommand",
3435
"ListImagesSubCommand",

gpustack_runtime/cmds/detector.py

Lines changed: 148 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,13 @@
55
import time
66
from typing import TYPE_CHECKING
77

8-
from ..detector import Devices, detect_devices
8+
from ..detector import (
9+
Devices,
10+
Topology,
11+
detect_devices,
12+
get_devices_topologies,
13+
group_devices_by_manufacturer,
14+
)
915
from .__types__ import SubCommand
1016

1117
if TYPE_CHECKING:
@@ -49,27 +55,67 @@ def __init__(self, args: Namespace):
4955
self.watch = args.watch
5056

5157
def run(self):
52-
try:
53-
while True:
54-
devs: Devices = detect_devices(fast=False)
55-
print("\033[2J\033[H", end="")
56-
match self.format.lower():
57-
case "json":
58-
print(format_devices_json(devs))
59-
case _:
60-
# Group devs by manufacturer
61-
grouped_devs: dict[str, Devices] = {}
62-
for dev in devs:
63-
if dev.manufacturer not in grouped_devs:
64-
grouped_devs[dev.manufacturer] = []
65-
grouped_devs[dev.manufacturer].append(dev)
66-
for manu in sorted(grouped_devs.keys()):
67-
print(format_devices_table(grouped_devs[manu]))
68-
if not self.watch:
69-
break
70-
time.sleep(self.watch)
71-
except KeyboardInterrupt:
58+
while True:
59+
devs: Devices = detect_devices(fast=False)
7260
print("\033[2J\033[H", end="")
61+
match self.format.lower():
62+
case "json":
63+
print(format_devices_json(devs))
64+
case _:
65+
# Group devices by manufacturer.
66+
group_devs = group_devices_by_manufacturer(devs)
67+
if not group_devs:
68+
print("No GPUs detected.")
69+
else:
70+
# Print each group separately.
71+
for devs in group_devs.values():
72+
print(format_devices_table(devs))
73+
if not self.watch:
74+
break
75+
time.sleep(self.watch)
76+
77+
78+
class GetDevicesTopologySubCommand(SubCommand):
    """
    Command to get the GPUs topology.

    Registered as ``topology`` (alias ``topo``) and prints the topologies
    returned by ``get_devices_topologies`` either as tables or as JSON.
    """

    # Output format, one of "table" or "json".
    format: str = "table"

    @staticmethod
    def register(parser: _SubParsersAction):
        """
        Register the ``topology`` sub-command and its arguments on ``parser``.
        """
        topo_parser = parser.add_parser(
            "topology",
            help="Detect GPUs topology",
            aliases=["topo"],
        )

        topo_parser.add_argument(
            "--format",
            type=str,
            choices=["table", "json"],
            default="table",
            help="Output format",
        )

        topo_parser.set_defaults(func=GetDevicesTopologySubCommand)

    def __init__(self, args: Namespace):
        self.format = args.format

    def run(self):
        """
        Query the topologies and print them in the selected format.

        JSON output is written as-is, without the terminal clear-screen
        sequence and with an empty list when nothing is found, so that it
        stays parsable when piped into another program.
        """
        topologies = get_devices_topologies(fast=False)

        if self.format.lower() == "json":
            print(format_topologies_json(topologies or []))
            return

        # Clear the screen and move the cursor home before drawing tables.
        print("\033[2J\033[H", end="")
        if not topologies:
            print("No GPU topology information available.")
            return

        for topo in topologies:
            print(format_topology_table(topo))
73119

74120

75121
def format_devices_json(devs: Devices) -> str:
@@ -151,3 +197,84 @@ def format_devices_table(devs: Devices) -> str:
151197

152198
# Combine all parts
153199
return os.linesep.join(header_lines + device_lines + footer_lines)
200+
201+
202+
def format_topology_table(topo: Topology) -> str:
    """
    Render a topology as an ASCII table followed by a legend.

    The first column labels each row ("Device N"); the remaining cells of a
    row are taken verbatim from ``topo.stringify()``.
    NOTE(review): the header assumes every row holds one cell per device
    followed by the CPU and NUMA affinity cells - confirm against
    ``Topology.stringify``. Rows of a different length are tolerated: short
    rows are padded with blanks and extra cells get an empty header, so the
    table always stays rectangular instead of raising or misaligning.

    Args:
        topo:
            The topology to render; must provide ``manufacturer`` and
            ``stringify()``.

    Returns:
        The table and legend joined with ``os.linesep``.

    """
    content = topo.stringify()

    # Column headers
    col_headers = [str(topo.manufacturer).upper()]
    col_headers += [f"Device {idx}" for idx in range(len(content))]
    col_headers += ["CPU Affinity", "NUMA Affinity"]

    # Stringify every cell once so width calculation and rendering agree.
    rows = [
        [f"Device {row_idx}", *(str(cell) for cell in row_devs)]
        for row_idx, row_devs in enumerate(content)
    ]

    # Make headers and rows rectangular.
    col_count = max([len(col_headers), *(len(row) for row in rows)])
    col_headers += [""] * (col_count - len(col_headers))
    for row in rows:
        row += [""] * (col_count - len(row))

    # Widest cell per column plus one space of padding on each side.
    # Because widths derive from the cells themselves, no cell ever needs
    # truncating.
    col_widths = [
        max(len(cell) for cell in column) + 2
        for column in zip(col_headers, *rows)
    ]

    # Total width: all columns, one "|" between/around them.
    width = sum(col_widths) + len(col_widths) + 1
    border = "+" + "-" * (width - 2) + "+"

    # Column header line
    col_header_line = "|" + "".join(
        f" {header.center(col_width - 2)} |"
        for header, col_width in zip(col_headers, col_widths)
    )

    # Separator line
    separator = "|" + "|".join("-" * col_width for col_width in col_widths) + "|"

    # Topology rows
    topology_lines = [
        "|"
        + "".join(
            f" {cell.ljust(col_width - 2)} |"
            for cell, col_width in zip(row, col_widths)
        )
        for row in rows
    ]

    # Legend
    legend_lines = [
        "",
        "Legend (from nearest to farthest):",
        "  X    = Self",
        "  LINK = Connection traversing with High-Speed Link (e.g., NVIDIA NVLink, Ascend HCCS)",
        "  PIX  = Connection traversing at most a single PCIe bridge",
        "  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)",
        "  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)",
        "  NODE = Connection traversing PCIe and the interconnect between NUMA nodes",
        "  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)",
    ]

    # Combine all parts
    return os.linesep.join(
        [border, col_header_line, separator, *topology_lines, border, *legend_lines],
    )
277+
278+
279+
def format_topologies_json(topologies: list[Topology]) -> str:
    """
    Serialize the given topologies to a JSON array.

    Each topology is converted with its ``to_dict()`` method and the
    result is dumped with a two-space indent.
    """
    payload = []
    for item in topologies:
        payload.append(item.to_dict())
    return json.dumps(payload, indent=2)

gpustack_runtime/deployer/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@
5151
DockerDeployer(),
5252
KubernetesDeployer(),
5353
]
54+
"""
55+
List of all deployers.
56+
"""
57+
58+
_DEPLOYERS_MAP: dict[str, Deployer] = {dep.name: dep for dep in _DEPLOYERS}
59+
"""
60+
Mapping from deployer name to deployer.
61+
"""
5462

5563

5664
def supported_list() -> list[Deployer]:

gpustack_runtime/detector/__init__.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99
Device,
1010
Devices,
1111
ManufacturerEnum,
12+
Topology,
1213
backend_to_manufacturer,
1314
manufacturer_to_backend,
15+
reduce_devices_distances,
1416
supported_backends,
1517
supported_manufacturers,
1618
)
@@ -35,6 +37,16 @@
3537
MThreadsDetector(),
3638
NVIDIADetector(),
3739
]
40+
"""
41+
List of all detectors.
42+
"""
43+
44+
_DETECTORS_MAP: dict[ManufacturerEnum, Detector] = {
45+
det.manufacturer: det for det in _DETECTORS
46+
}
47+
"""
48+
Mapping from manufacturer to detector.
49+
"""
3850

3951

4052
def supported_list() -> list[Detector]:
@@ -113,14 +125,82 @@ def detect_devices(fast: bool = True) -> Devices:
113125
return devices
114126

115127

128+
def get_devices_topologies(
    devices: Devices | None = None,
    fast: bool = True,
) -> list[Topology]:
    """
    Collect topology information for the given devices.

    Devices are grouped by manufacturer and each group is handed to the
    detector registered for that manufacturer; groups without a registered
    detector, or whose detector yields no topology, are left out.

    Args:
        devices:
            The devices to inspect.
            If None, devices are detected automatically, forwarding `fast`.
        fast:
            If True, stop after the first topology that was obtained.
            Otherwise, gather the topologies of every manufacturer group.

    Returns:
        A list with one Topology per manufacturer group that produced one.

    """
    if devices is None:
        devices = detect_devices(fast=fast)

    results: list[Topology] = []

    for manufacturer, members in group_devices_by_manufacturer(devices).items():
        detector = _DETECTORS_MAP.get(manufacturer)
        if detector is None:
            continue

        topology = detector.get_topology(members)
        if not topology:
            continue

        results.append(topology)
        if fast:
            # One topology is enough in fast mode.
            break

    return results
168+
169+
170+
def group_devices_by_manufacturer(
    devices: Devices | None,
) -> dict[ManufacturerEnum, Devices]:
    """
    Bucket devices by their ``manufacturer`` attribute.

    Args:
        devices:
            The devices to bucket; None is treated as an empty list.

    Returns:
        A dictionary from manufacturer to the devices of that manufacturer.
        Both the keys and the devices inside each bucket keep the order in
        which they were first encountered.

    """
    buckets: dict[ManufacturerEnum, Devices] = {}
    if not devices:
        return buckets
    for device in devices:
        buckets.setdefault(device.manufacturer, []).append(device)
    return buckets
190+
191+
116192
__all__ = [
117193
"Device",
118194
"Devices",
119195
"ManufacturerEnum",
196+
"Topology",
120197
"backend_to_manufacturer",
121198
"detect_backend",
122199
"detect_devices",
200+
"get_devices_topologies",
201+
"group_devices_by_manufacturer",
123202
"manufacturer_to_backend",
203+
"reduce_devices_distances",
124204
"supported_backends",
125205
"supported_list",
126206
"supported_manufacturers",

0 commit comments

Comments
 (0)