Commit 66e0f21

gpu: make shared assignment class
Signed-off-by: vsoch <[email protected]>
1 parent 7342925 commit 66e0f21

2 files changed: 44 additions, 69 deletions

fluxbind/shape/gpu.py

Lines changed: 29 additions & 24 deletions

```diff
@@ -1,42 +1,47 @@
 from dataclasses import dataclass
 
-
 @dataclass
 class GPUAssignment:
     """
-    Data structure to hold information about a rank's assigned GPU.
+    A data structure to hold information about a rank's assigned GPU(s).
+    Instances are created via the for_rank() classmethod.
     """
-
-    indices: list[int]  # logical index of the GPU
-    pci_ids: list[str]  # The PCI bus ID of the GPU
-    cuda_devices: str  # CUDA_VISIBLE_DEVICES
+    indices: list[int]  # The logical indices in the ordered list (e.g., [4, 5])
+    pci_ids: list[str]  # The corresponding PCI bus IDs of the GPUs
+    numa_indices: set[int]  # The set of unique NUMA nodes these GPUs are on
+    cuda_devices: str  # The final string for CUDA_VISIBLE_DEVICES (e.g., "4,5")
 
     @classmethod
-    def for_rank(cls, local_rank, gpus_per_task=None, gpu_pci_ids=None):
+    def for_rank(
+        cls,
+        local_rank: int,
+        gpus_per_task: int,
+        ordered_gpus: list[dict]
+    ) -> "GPUAssignment":
         """
-        A factory method that assigns a GPU to a given local rank
-        using a round-robin strategy.
+        A factory method that assigns a slice of GPUs to a given local rank
+        from a pre-ordered, topology-aware list of all GPUs.
         """
-        if not gpu_pci_ids:
+        if not ordered_gpus:
             raise RuntimeError("Attempted to assign a GPU, but no GPUs were discovered.")
+
+        start_idx = local_rank * gpus_per_task
+        end_idx = start_idx + gpus_per_task
 
-        # Assume one gpu per task, since we are calling this, period
-        gpus_per_task = gpus_per_task or 1
-
-        # 1. Calculate the starting GPU index for this rank.
-        start_gpu_index = local_rank * gpus_per_task
-        end_gpu_index = start_gpu_index + gpus_per_task
-
-        if end_gpu_index > len(gpu_pci_ids):
+        if end_idx > len(ordered_gpus):
             raise ValueError(
                 f"Cannot satisfy request for {gpus_per_task} GPUs for local_rank {local_rank}. "
-                f"Only {len(gpu_pci_ids)} GPUs available in total."
+                f"Only {len(ordered_gpus)} GPUs available in total."
             )
 
-        # Return the assignment
-        assigned_indices = list(range(start_gpu_index, end_gpu_index))
+        assigned_gpu_slice = ordered_gpus[start_idx:end_idx]
+
+        # The global indices for CUDA_VISIBLE_DEVICES are their positions in the ordered list
+        assigned_indices = list(range(start_idx, end_idx))
+
         return cls(
             indices=assigned_indices,
-            pci_ids=[gpu_pci_ids[i] for i in assigned_indices],
-            cuda_devices=",".join([str(x) for x in assigned_indices]),
-        )
+            pci_ids=[gpu['pci_id'] for gpu in assigned_gpu_slice],
+            numa_indices={gpu['numa_index'] for gpu in assigned_gpu_slice},
+            cuda_devices=",".join(map(str, assigned_indices))
+        )
```
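
To make the new interface concrete, here is a minimal usage sketch. The `ordered_gpus` entries are invented sample data; the only assumption taken from the diff is that each entry is a dict with `pci_id` and `numa_index` keys, and the import path simply mirrors the file location `fluxbind/shape/gpu.py`.

```python
# Hypothetical sample data: four GPUs, two per NUMA node, already in
# topology-aware order. PCI addresses and the NUMA layout are invented.
from fluxbind.shape.gpu import GPUAssignment  # assumed import path

ordered_gpus = [
    {"pci_id": "0000:07:00.0", "numa_index": 0},
    {"pci_id": "0000:0b:00.0", "numa_index": 0},
    {"pci_id": "0000:48:00.0", "numa_index": 1},
    {"pci_id": "0000:4c:00.0", "numa_index": 1},
]

# Local rank 1 with 2 GPUs per task receives the second slice (positions 2 and 3).
assignment = GPUAssignment.for_rank(local_rank=1, gpus_per_task=2, ordered_gpus=ordered_gpus)

print(assignment.indices)       # [2, 3]
print(assignment.pci_ids)       # ['0000:48:00.0', '0000:4c:00.0']
print(assignment.numa_indices)  # {1}
print(assignment.cuda_devices)  # 2,3
```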

fluxbind/shape/shape.py

Lines changed: 15 additions & 45 deletions

```diff
@@ -135,68 +135,38 @@ def get_gpu_local_binding(self, rule: dict, local_rank: int, gpus_per_task: int)
         """
         Calculate a 'gpu-local' binding using the topology-aware ordered GPU list.
         """
-        if not self.ordered_gpus:
-            raise RuntimeError("Shape specifies 'bind: gpu-local', but no GPUs were discovered.")
-
-        # Assign a slice of GPUs from the canonical, ordered list.
-        start_idx = local_rank * gpus_per_task
-        end_idx = start_idx + gpus_per_task
-        if end_idx > len(self.ordered_gpus):
-            raise ValueError(f"Not enough total GPUs to satisfy request for rank {local_rank}.")
-
-        assigned_gpu_slice = self.ordered_gpus[start_idx:end_idx]
-        cuda_devices = ",".join([str(start_idx + i) for i, _ in enumerate(assigned_gpu_slice)])
-
+        assignment = gpus.GPUAssignment.for_rank(local_rank, gpus_per_task, self.ordered_gpus)
+
         # The CPU domain is the union of NUMA nodes for the assigned GPUs.
-        local_numa_indices = sorted(list({gpu["numa_index"] for gpu in assigned_gpu_slice}))
-        domain_locations = [f"numa:{i}" for i in local_numa_indices]
-        domain = " ".join(domain_locations)  # e.g., "numa:0" or "numa:0 numa:1"
-
-        # Get the final CPU binding WITHIN that domain.
+        domain_locations = [f"numa:{i}" for i in assignment.numa_indices]
+        domain = " ".join(domain_locations)
         cpu_binding_string = self.get_binding_in_gpu_domain(rule, local_rank, gpus_per_task, domain)
-        return f"{cpu_binding_string};{cuda_devices}"
-
+        return f"{cpu_binding_string};{assignment.cuda_devices}"
+
     def get_gpu_remote_binding(self, rule: dict, local_rank: int, gpus_per_task: int) -> str:
         """
         Calculates a 'gpu-remote' binding using the topology-aware ordered GPU list.
         """
         if len(self.numa_node_cpusets) < 2:
             raise RuntimeError("'bind: gpu-remote' is invalid on a single-NUMA system.")
-        if not self.ordered_gpus:
-            raise RuntimeError("Shape specifies 'bind: gpu-remote', but no GPUs were discovered.")
-
-        # Assign a slice of GPUs to determine the local NUMA domains.
-        start_idx = local_rank * gpus_per_task
-        end_idx = start_idx + gpus_per_task
-        if end_idx > len(self.ordered_gpus):
-            raise ValueError(f"Not enough total GPUs to satisfy request for rank {local_rank}.")
+        assignment = gpus.GPUAssignment.for_rank(local_rank, gpus_per_task, self.ordered_gpus)
 
-        assigned_gpu_slice = self.ordered_gpus[start_idx:end_idx]
-        cuda_devices = ",".join([str(start_idx + i) for i, _ in enumerate(assigned_gpu_slice)])
-
-        # Find the set of all local NUMA domains for this rank's GPUs.
-        local_numa_indices = {gpu["numa_index"] for gpu in assigned_gpu_slice}
-
-        # Find all remote NUMA domains.
+        # Find all remote NUMA domains relative to the set of local domains.
         all_numa_indices = set(range(len(self.numa_node_cpusets)))
-        remote_numa_indices = sorted(list(all_numa_indices - local_numa_indices))
-
+        remote_numa_indices = sorted(list(all_numa_indices - assignment.numa_indices))
+
         if not remote_numa_indices:
-            raise RuntimeError(
-                f"Cannot find a remote NUMA node for rank {local_rank}; its GPUs span all NUMA domains."
-            )
-
-        # 4. Select the target remote domain.
-        offset = rule.get("offset", 0)
+            raise RuntimeError(f"Cannot find a remote NUMA node for rank {local_rank}; its GPUs span all NUMA domains.")
+
+        offset = rule.get('offset', 0)
         if offset >= len(remote_numa_indices):
             raise ValueError(f"Offset {offset} is out of range for remote NUMA domains.")
-
+
         target_remote_numa_idx = remote_numa_indices[offset]
         domain = f"numa:{target_remote_numa_idx}"
 
-        # Get the final CPU binding WITHIN that remote domain.
         cpu_binding_string = self.get_binding_in_gpu_domain(rule, local_rank, gpus_per_task, domain)
-        return f"{cpu_binding_string};{cuda_devices}"
+        return f"{cpu_binding_string};{assignment.cuda_devices}"
 
     def get_binding_in_gpu_domain(
         self, rule: dict, local_rank: int, gpus_per_task: int, domain: str
```
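
The NUMA-domain logic that both binding methods now share is easy to replay in isolation. The sketch below is not fluxbind code: every value is invented (four NUMA nodes, a rank whose GPUs sit on node 1, an `offset` of 1), and the names only stand in for `self.numa_node_cpusets`, `assignment.numa_indices`, and `rule.get('offset', 0)`.

```python
# Standalone sketch of the domain selection in get_gpu_local_binding and
# get_gpu_remote_binding, using invented values.
num_numa_nodes = 4      # stand-in for len(self.numa_node_cpusets)
numa_indices = {1}      # stand-in for assignment.numa_indices
offset = 1              # stand-in for rule.get('offset', 0)

# gpu-local: the CPU domain is the union of the assigned GPUs' NUMA nodes.
local_domain = " ".join(f"numa:{i}" for i in numa_indices)      # "numa:1"

# gpu-remote: choose a NUMA node the assigned GPUs are NOT attached to.
all_numa_indices = set(range(num_numa_nodes))
remote_numa_indices = sorted(all_numa_indices - numa_indices)   # [0, 2, 3]
if not remote_numa_indices:
    raise RuntimeError("GPUs span all NUMA domains; no remote node exists.")
if offset >= len(remote_numa_indices):
    raise ValueError("Offset is out of range for remote NUMA domains.")
remote_domain = f"numa:{remote_numa_indices[offset]}"           # "numa:2"

print(local_domain, remote_domain)
```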
