refactor: get topology

thxCode · thxCode · commit f4e0fd691173 · 2025-12-24T18:16:08.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/gpustack_runtime/detector/__utils__.py b/gpustack_runtime/detector/__utils__.py
@@ -713,7 +713,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
         return ""
 
     if isinstance(cpu_affinity, int):
-        cpu_indices = bits_to_list(cpu_affinity)
+        cpu_indices = bitmask_to_list(cpu_affinity)
     else:
         cpu_indices: list[int] = []
         for part in cpu_affinity.split(","):
@@ -762,7 +762,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
         return ""
 
     if isinstance(numa_node, int):
-        numa_indices = bits_to_list(numa_node)
+        numa_indices = bitmask_to_list(numa_node)
     else:
         numa_indices: list[int] = []
         for part in numa_node.split(","):
@@ -791,12 +791,12 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
     return list_to_range_str(sorted(cpu_cores))
 
 
-def bits_to_list(bits: int, offset: int = 0) -> list[int]:
+def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
     """
     Convert a bitmask to a list of set bit indices.
 
     Args:
-        bits:
+        bitmask:
             The bitmask as an integer.
         offset:
             The offset to add to each index.
@@ -805,38 +805,32 @@ def bits_to_list(bits: int, offset: int = 0) -> list[int]:
         A list of indices where the bits are set to 1.
 
     """
-    bits_len = bits.bit_length()
-    indices = [offset + i for i in range(bits_len) if (bits >> i) & 1]
+    bits_len = bitmask.bit_length()
+    indices = [offset + i for i in range(bits_len) if (bitmask >> i) & 1]
     return indices
 
 
-def bits_to_str(bits: int, offset: int = 0, prefix: str = "") -> str:
+def bitmask_to_str(bitmask_list: list) -> str:
     """
     Convert a bitmask to a comma-separated string of set bit indices.
 
     Args:
-        bits:
-            The bitmask as an integer.
-        offset:
-            The offset to add to each index.
-        prefix:
-            The prefix to add to the string, separated by a comma.
+        bitmask_list:
+            A list of integers, each holding one consecutive chunk of the bitmask.
 
     Returns:
-        If the bitmask is 0, returns blank string.
-        If the bits are contiguous, returns a range string (e.g., "2-5"),
+        If the set bits are contiguous, returns a range string (e.g., "2-5"),
         Otherwise, returns a comma-separated string of indices (e.g., "0,2-4").
 
     """
-    bits_str = prefix
-
-    bits_list = bits_to_list(bits, offset)
-    if bits_list:
-        if bits_str:
-            bits_str += ","
-        bits_str += list_to_range_str(bits_list)
+    bits_lists = []
+    offset = 0
+    for bitmask in bitmask_list:
+        if bitmask != 0:
+            bits_lists.extend(bitmask_to_list(bitmask, offset))
+        offset += get_bits_size()
 
-    return bits_str
+    return list_to_range_str(sorted(bits_lists))
 
 
 def list_to_range_str(indices: list[int]) -> str:
diff --git a/gpustack_runtime/detector/nvidia.py b/gpustack_runtime/detector/nvidia.py
@@ -14,7 +14,7 @@
 from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
 from .__utils__ import (
     PCIDevice,
-    bits_to_str,
+    bitmask_to_str,
     byte_to_mebibyte,
     get_brief_version,
     get_cpuset_size,
@@ -429,17 +429,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
                         dev_i_handle,
                         get_cpuset_size(),
                     )
-                    cpuset_bits_offset = 0
-                    for cpuset_bits in dev_i_cpuset:
-                        cpuset_bits_len = cpuset_bits.bit_length()
-                        if cpuset_bits != 0:
-                            cpuset_bits_str = bits_to_str(
-                                bits=cpuset_bits,
-                                offset=cpuset_bits_offset,
-                                prefix=topology.devices_cpu_affinities[i],
-                            )
-                            topology.devices_cpu_affinities[i] = cpuset_bits_str
-                        cpuset_bits_offset += cpuset_bits_len
+                    topology.devices_cpu_affinities[i] = bitmask_to_str(
+                        list(dev_i_cpuset),
+                    )
 
                 except pynvml.NVMLError:
                     debug_log_exception(
@@ -455,17 +447,9 @@ def get_topology(self, devices: Devices | None = None) -> Topology | None:
                         get_numa_nodeset_size(),
                         pynvml.NVML_AFFINITY_SCOPE_NODE,
                     )
-                    memset_bits_offset = 0
-                    for memset_bits in dev_i_memset:
-                        memset_bits_len = memset_bits.bit_length()
-                        if memset_bits != 0:
-                            memset_bits_str = bits_to_str(
-                                bits=memset_bits,
-                                offset=memset_bits_offset,
-                                prefix=topology.devices_numa_affinities[i],
-                            )
-                            topology.devices_numa_affinities[i] = memset_bits_str
-                        memset_bits_offset += memset_bits_len
+                    topology.devices_numa_affinities[i] = bitmask_to_str(
+                        list(dev_i_memset),
+                    )
                 except pynvml.NVMLError:
                     debug_log_exception(
                         logger,
diff --git a/tests/gpustack_runtime/detector/samples/detect_output_nvidia_h200.json b/tests/gpustack_runtime/detector/samples/detect_output_nvidia_h200.json
@@ -0,0 +1,194 @@
+[
+  {
+    "manufacturer": "nvidia",
+    "index": 0,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-d0b737ba-bab2-9fc1-645b-a70606d5760c",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 28,
+    "power": 700,
+    "power_used": 75,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 1,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-dd618a30-781a-55fc-17ff-7f88f83928e8",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 29,
+    "power": 700,
+    "power_used": 74,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 2,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-27160e5c-2ae6-7ea4-46f7-c05e611a7601",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 27,
+    "power": 700,
+    "power_used": 72,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 3,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-a9e99f6f-e2d6-ab57-6cac-cad9eb682ae9",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 28,
+    "power": 700,
+    "power_used": 75,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 4,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-1a8a68a7-03a0-e120-721e-e734cf8834c4",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 26,
+    "power": 700,
+    "power_used": 73,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 5,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-8492f058-dd55-f940-daef-f56e9656a338",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 28,
+    "power": 700,
+    "power_used": 73,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 6,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-b3b1c2df-bd5c-daba-4f32-b99cd5daf467",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 25,
+    "power": 700,
+    "power_used": 73,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  },
+  {
+    "manufacturer": "nvidia",
+    "index": 7,
+    "name": "NVIDIA H200",
+    "uuid": "GPU-07eaf04c-26c1-2862-33d4-02a983f02166",
+    "driver_version": "570.124.06",
+    "runtime_version": "12.8",
+    "runtime_version_original": "12.8.0",
+    "compute_capability": "9.0",
+    "cores": 132,
+    "cores_utilization": 0,
+    "memory": 143771,
+    "memory_used": 607,
+    "memory_utilization": 0.42,
+    "temperature": 26,
+    "power": 700,
+    "power_used": 74,
+    "appendix": {
+      "arch_family": "Hopper",
+      "vgpu": false,
+      "fabric_cluster_uuid": "00000000-0000-0000-0000-000000000000",
+      "fabric_clique_id": 0
+    }
+  }
+]
diff --git a/tests/gpustack_runtime/detector/samples/topology_output_nvidia_h200.json b/tests/gpustack_runtime/detector/samples/topology_output_nvidia_h200.json