Skip to content

Commit f4e0fd6

Browse files
committed
refactor: get topology
Signed-off-by: thxCode <[email protected]>
1 parent 0547d08 commit f4e0fd6

File tree

4 files changed

+325
-46
lines changed

4 files changed

+325
-46
lines changed

gpustack_runtime/detector/__utils__.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
713713
return ""
714714

715715
if isinstance(cpu_affinity, int):
716-
cpu_indices = bits_to_list(cpu_affinity)
716+
cpu_indices = bitmask_to_list(cpu_affinity)
717717
else:
718718
cpu_indices: list[int] = []
719719
for part in cpu_affinity.split(","):
@@ -762,7 +762,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
762762
return ""
763763

764764
if isinstance(numa_node, int):
765-
numa_indices = bits_to_list(numa_node)
765+
numa_indices = bitmask_to_list(numa_node)
766766
else:
767767
numa_indices: list[int] = []
768768
for part in numa_node.split(","):
@@ -791,12 +791,12 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
791791
return list_to_range_str(sorted(cpu_cores))
792792

793793

794-
def bits_to_list(bits: int, offset: int = 0) -> list[int]:
794+
def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
795795
"""
796796
Convert a bitmask to a list of set bit indices.
797797
798798
Args:
799-
bits:
799+
bitmask:
800800
The bitmask as an integer.
801801
offset:
802802
The offset to add to each index.
@@ -805,38 +805,32 @@ def bits_to_list(bits: int, offset: int = 0) -> list[int]:
805805
A list of indices where the bits are set to 1.
806806
807807
"""
808-
bits_len = bits.bit_length()
809-
indices = [offset + i for i in range(bits_len) if (bits >> i) & 1]
808+
bits_len = bitmask.bit_length()
809+
indices = [offset + i for i in range(bits_len) if (bitmask >> i) & 1]
810810
return indices
811811

812812

813-
def bits_to_str(bits: int, offset: int = 0, prefix: str = "") -> str:
813+
def bitmask_to_str(bitmask_list: list) -> str:
814814
"""
815815
Convert a bitmask to a comma-separated string of set bit indices.
816816
817817
Args:
818-
bits:
819-
The bitmask as an integer.
820-
offset:
821-
The offset to add to each index.
822-
prefix:
823-
The prefix to add to the string, separated by a comma.
818+
bitmask_list:
819+
A list of integers, each holding one consecutive chunk of the bitmask.
824820
825821
Returns:
826-
If the bitmask is 0, returns blank string.
827-
If the bits are contiguous, returns a range string (e.g., "2-5"),
822+
If the set bits are contiguous, returns a range string (e.g., "2-5"),
828823
Otherwise, returns a comma-separated string of indices (e.g., "0,2-4").
829824
830825
"""
831-
bits_str = prefix
832-
833-
bits_list = bits_to_list(bits, offset)
834-
if bits_list:
835-
if bits_str:
836-
bits_str += ","
837-
bits_str += list_to_range_str(bits_list)
826+
bits_lists = []
827+
offset = 0
828+
for bitmask in bitmask_list:
829+
if bitmask != 0:
830+
bits_lists.extend(bitmask_to_list(bitmask, offset))
831+
offset += get_bits_size()
838832

839-
return bits_str
833+
return list_to_range_str(sorted(bits_lists))
840834

841835

842836
def list_to_range_str(indices: list[int]) -> str:

gpustack_runtime/detector/nvidia.py

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
1515
from .__utils__ import (
1616
PCIDevice,
17-
bits_to_str,
17+
bitmask_to_str,
1818
byte_to_mebibyte,
1919
get_brief_version,
2020
get_cpuset_size,
@@ -429,17 +429,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
429429
dev_i_handle,
430430
get_cpuset_size(),
431431
)
432-
cpuset_bits_offset = 0
433-
for cpuset_bits in dev_i_cpuset:
434-
cpuset_bits_len = cpuset_bits.bit_length()
435-
if cpuset_bits != 0:
436-
cpuset_bits_str = bits_to_str(
437-
bits=cpuset_bits,
438-
offset=cpuset_bits_offset,
439-
prefix=topology.devices_cpu_affinities[i],
440-
)
441-
topology.devices_cpu_affinities[i] = cpuset_bits_str
442-
cpuset_bits_offset += cpuset_bits_len
432+
topology.devices_cpu_affinities[i] = bitmask_to_str(
433+
list(dev_i_cpuset),
434+
)
443435

444436
except pynvml.NVMLError:
445437
debug_log_exception(
@@ -455,17 +447,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
455447
get_numa_nodeset_size(),
456448
pynvml.NVML_AFFINITY_SCOPE_NODE,
457449
)
458-
memset_bits_offset = 0
459-
for memset_bits in dev_i_memset:
460-
memset_bits_len = memset_bits.bit_length()
461-
if memset_bits != 0:
462-
memset_bits_str = bits_to_str(
463-
bits=memset_bits,
464-
offset=memset_bits_offset,
465-
prefix=topology.devices_numa_affinities[i],
466-
)
467-
topology.devices_numa_affinities[i] = memset_bits_str
468-
memset_bits_offset += memset_bits_len
450+
topology.devices_numa_affinities[i] = bitmask_to_str(
451+
list(dev_i_memset),
452+
)
469453
except pynvml.NVMLError:
470454
debug_log_exception(
471455
logger,
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
[
2+
{
3+
"manufacturer": "nvidia",
4+
"index": 0,
5+
"name": "NVIDIA H200",
6+
"uuid": "GPU-d0b737ba-bab2-9fc1-645b-a70606d5760c",
7+
"driver_version": "570.124.06",
8+
"runtime_version": "12.8",
9+
"runtime_version_original": "12.8.0",
10+
"compute_capability": "9.0",
11+
"cores": 132,
12+
"cores_utilization": 0,
13+
"memory": 143771,
14+
"memory_used": 607,
15+
"memory_utilization": 0.42,
16+
"temperature": 28,
17+
"power": 700,
18+
"power_used": 75,
19+
"appendix": {
20+
"arch_family": "Hopper",
21+
"vgpu": false,
22+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
23+
"fabric_clique_id": 0
24+
}
25+
},
26+
{
27+
"manufacturer": "nvidia",
28+
"index": 1,
29+
"name": "NVIDIA H200",
30+
"uuid": "GPU-dd618a30-781a-55fc-17ff-7f88f83928e8",
31+
"driver_version": "570.124.06",
32+
"runtime_version": "12.8",
33+
"runtime_version_original": "12.8.0",
34+
"compute_capability": "9.0",
35+
"cores": 132,
36+
"cores_utilization": 0,
37+
"memory": 143771,
38+
"memory_used": 607,
39+
"memory_utilization": 0.42,
40+
"temperature": 29,
41+
"power": 700,
42+
"power_used": 74,
43+
"appendix": {
44+
"arch_family": "Hopper",
45+
"vgpu": false,
46+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
47+
"fabric_clique_id": 0
48+
}
49+
},
50+
{
51+
"manufacturer": "nvidia",
52+
"index": 2,
53+
"name": "NVIDIA H200",
54+
"uuid": "GPU-27160e5c-2ae6-7ea4-46f7-c05e611a7601",
55+
"driver_version": "570.124.06",
56+
"runtime_version": "12.8",
57+
"runtime_version_original": "12.8.0",
58+
"compute_capability": "9.0",
59+
"cores": 132,
60+
"cores_utilization": 0,
61+
"memory": 143771,
62+
"memory_used": 607,
63+
"memory_utilization": 0.42,
64+
"temperature": 27,
65+
"power": 700,
66+
"power_used": 72,
67+
"appendix": {
68+
"arch_family": "Hopper",
69+
"vgpu": false,
70+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
71+
"fabric_clique_id": 0
72+
}
73+
},
74+
{
75+
"manufacturer": "nvidia",
76+
"index": 3,
77+
"name": "NVIDIA H200",
78+
"uuid": "GPU-a9e99f6f-e2d6-ab57-6cac-cad9eb682ae9",
79+
"driver_version": "570.124.06",
80+
"runtime_version": "12.8",
81+
"runtime_version_original": "12.8.0",
82+
"compute_capability": "9.0",
83+
"cores": 132,
84+
"cores_utilization": 0,
85+
"memory": 143771,
86+
"memory_used": 607,
87+
"memory_utilization": 0.42,
88+
"temperature": 28,
89+
"power": 700,
90+
"power_used": 75,
91+
"appendix": {
92+
"arch_family": "Hopper",
93+
"vgpu": false,
94+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
95+
"fabric_clique_id": 0
96+
}
97+
},
98+
{
99+
"manufacturer": "nvidia",
100+
"index": 4,
101+
"name": "NVIDIA H200",
102+
"uuid": "GPU-1a8a68a7-03a0-e120-721e-e734cf8834c4",
103+
"driver_version": "570.124.06",
104+
"runtime_version": "12.8",
105+
"runtime_version_original": "12.8.0",
106+
"compute_capability": "9.0",
107+
"cores": 132,
108+
"cores_utilization": 0,
109+
"memory": 143771,
110+
"memory_used": 607,
111+
"memory_utilization": 0.42,
112+
"temperature": 26,
113+
"power": 700,
114+
"power_used": 73,
115+
"appendix": {
116+
"arch_family": "Hopper",
117+
"vgpu": false,
118+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
119+
"fabric_clique_id": 0
120+
}
121+
},
122+
{
123+
"manufacturer": "nvidia",
124+
"index": 5,
125+
"name": "NVIDIA H200",
126+
"uuid": "GPU-8492f058-dd55-f940-daef-f56e9656a338",
127+
"driver_version": "570.124.06",
128+
"runtime_version": "12.8",
129+
"runtime_version_original": "12.8.0",
130+
"compute_capability": "9.0",
131+
"cores": 132,
132+
"cores_utilization": 0,
133+
"memory": 143771,
134+
"memory_used": 607,
135+
"memory_utilization": 0.42,
136+
"temperature": 28,
137+
"power": 700,
138+
"power_used": 73,
139+
"appendix": {
140+
"arch_family": "Hopper",
141+
"vgpu": false,
142+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
143+
"fabric_clique_id": 0
144+
}
145+
},
146+
{
147+
"manufacturer": "nvidia",
148+
"index": 6,
149+
"name": "NVIDIA H200",
150+
"uuid": "GPU-b3b1c2df-bd5c-daba-4f32-b99cd5daf467",
151+
"driver_version": "570.124.06",
152+
"runtime_version": "12.8",
153+
"runtime_version_original": "12.8.0",
154+
"compute_capability": "9.0",
155+
"cores": 132,
156+
"cores_utilization": 0,
157+
"memory": 143771,
158+
"memory_used": 607,
159+
"memory_utilization": 0.42,
160+
"temperature": 25,
161+
"power": 700,
162+
"power_used": 73,
163+
"appendix": {
164+
"arch_family": "Hopper",
165+
"vgpu": false,
166+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
167+
"fabric_clique_id": 0
168+
}
169+
},
170+
{
171+
"manufacturer": "nvidia",
172+
"index": 7,
173+
"name": "NVIDIA H200",
174+
"uuid": "GPU-07eaf04c-26c1-2862-33d4-02a983f02166",
175+
"driver_version": "570.124.06",
176+
"runtime_version": "12.8",
177+
"runtime_version_original": "12.8.0",
178+
"compute_capability": "9.0",
179+
"cores": 132,
180+
"cores_utilization": 0,
181+
"memory": 143771,
182+
"memory_used": 607,
183+
"memory_utilization": 0.42,
184+
"temperature": 26,
185+
"power": 700,
186+
"power_used": 74,
187+
"appendix": {
188+
"arch_family": "Hopper",
189+
"vgpu": false,
190+
"fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
191+
"fabric_clique_id": 0
192+
}
193+
}
194+
]

0 commit comments

Comments
 (0)