
Commit 7342925

feat: add amd gpu support (rocm-smi)
Signed-off-by: vsoch <[email protected]>
1 parent ef0f985 commit 7342925

4 files changed: +162, -58 lines

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 default:
   type: numa
-  locality: gpu-remote
+  bind: gpu-remote
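This hunk renames the shape-file key from locality to bind, keeping the gpu-remote value. As a quick illustration only (assuming PyYAML is available and that the rest of the shape schema is unchanged; how fluxbind dispatches on the value is not shown in this file), the rule now loads as:

import yaml  # PyYAML, assumed to be available alongside fluxbind

shape_text = """
default:
  type: numa
  bind: gpu-remote
"""

rule = yaml.safe_load(shape_text)["default"]
print(rule["type"])      # numa
print(rule.get("bind"))  # gpu-remote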

fluxbind/scripts/run_mapping.sh

Lines changed: 5 additions & 8 deletions
@@ -28,7 +28,7 @@ gpus_per_task=${GPUS_PER_TASK:-1}
 # For CPU jobs, CUDA_DEVICE_ID will be the string "NONE".
 echo fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank" --gpus-per-task "$gpus_per_task"
 BIND_INFO=$(fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank" --gpus-per-task "$gpus_per_task")
-echo $BIND_INFO
+echo

 # Exit if the helper script failed
 if [ $? -ne 0 ]; then
@@ -40,9 +40,6 @@ fi
 BIND_LOCATION="${BIND_INFO%;*}"
 CUDA_DEVICES="${BIND_INFO#*;}"

-echo "BIND_LOCATION: ${BIND_LOCATION}"
-echo "CUDA_DEVICES: ${CUDA_DEVICES}"
-
 if [[ "$CUDA_DEVICES" != "NONE" ]]; then
     export CUDA_VISIBLE_DEVICES=$CUDA_DEVICES
 fi
@@ -56,9 +53,9 @@ if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
 else
     # For a bound task, calculate the mask and lists from the target location string.
     binding_source=${BIND_LOCATION}
-    cpuset_mask=$(hwloc-calc "${BIND_LOCATION}")
-    logical_cpu_list=$(hwloc-calc "${BIND_LOCATION}" --intersect PU 2>/dev/null)
-    physical_core_list=$(hwloc-calc "${BIND_LOCATION}" --intersect core 2>/dev/null)
+    cpuset_mask=$(hwloc-calc ${BIND_LOCATION})
+    logical_cpu_list=$(hwloc-calc ${BIND_LOCATION} --intersect PU 2>/dev/null)
+    physical_core_list=$(hwloc-calc ${BIND_LOCATION} --intersect core 2>/dev/null)
 fi

 if [[ "$FLUXBIND_NOCOLOR" != "1" ]]
@@ -110,5 +107,5 @@ if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
     # Execute the command directly without changing affinity.
     exec "$@"
 else
-    exec hwloc-bind "${BIND_LOCATION}" -- "$@"
+    exec hwloc-bind ${BIND_LOCATION} -- "$@"
 fi
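For context on what the script parses: fluxbind shape prints a single "location;devices" string (shape.py returns f"{cpu_binding_string};{cuda_devices}"), and the bash expansions ${BIND_INFO%;*} and ${BIND_INFO#*;} split it. A minimal Python sketch of the same split, with illustrative values:

# Example BIND_INFO as emitted by `fluxbind shape` (values are made up for illustration).
bind_info = "numa:0;0,1"

# ${BIND_INFO%;*} keeps everything before the last ';'
bind_location = bind_info.rsplit(";", 1)[0]  # "numa:0"
# ${BIND_INFO#*;} keeps everything after the first ';'
cuda_devices = bind_info.split(";", 1)[1]    # "0,1"

# A CPU-only job signals "no GPUs" with the literal string NONE.
if cuda_devices != "NONE":
    print(f"would export CUDA_VISIBLE_DEVICES={cuda_devices}")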

fluxbind/shape/commands.py

Lines changed: 28 additions & 6 deletions
@@ -1,6 +1,6 @@
+import json
 import subprocess
 import sys
-from itertools import zip_longest


 class Command:
@@ -19,7 +19,6 @@ def run(self, command, shell: bool = False):
             return result.stdout.strip()
         except subprocess.CalledProcessError as e:
             cmd_str = command if shell else " ".join(command)
-            print(f"Error running '{cmd_str}': {e.stderr}", file=sys.stderr)
             raise RuntimeError("Command execution failed.") from e
         except FileNotFoundError as e:
             cmd_str = command[0] if isinstance(command, list) else command.split()[0]
@@ -29,15 +28,15 @@ def run(self, command, shell: bool = False):
 class HwlocCalcCommand(Command):
     name = "hwloc-calc"

-    def _parse_cpuset_to_list(self, cpuset_str: str) -> list[int]:
+    def parse_cpuset_to_list(self, cpuset_str: str) -> list[int]:
         """
         Convert a potentially comma-separated hex string into a list of integers.
         """
         if not cpuset_str or cpuset_str.lower() in ["0x0", "0"]:
             return [0]
         return [int(chunk, 16) for chunk in cpuset_str.strip().split(",")]

-    def _operate_on_lists(self, list_a: list[int], list_b: list[int], operator: str) -> list[int]:
+    def operate_on_lists(self, list_a: list[int], list_b: list[int], operator: str) -> list[int]:
         """
         Perform a bitwise operation on two lists of cpuset integers.
         """
@@ -112,8 +111,8 @@ def union_of_locations(self, locations: list[str]) -> str:

         for loc in locations:
             loc_cpuset_str = self.get_cpuset(loc)
-            loc_cpuset_list = self._parse_cpuset_to_list(loc_cpuset_str)
-            union_mask_list = self._union_of_lists(union_mask_list, loc_cpuset_list)
+            loc_cpuset_list = self.parse_cpuset_to_list(loc_cpuset_str)
+            union_mask_list = self.operate_on_lists(union_mask_list, loc_cpuset_list, "+")
         return " ".join([hex(chunk) for chunk in union_mask_list])


@@ -135,5 +134,28 @@ def get_pci_bus_ids(self) -> list[str]:
         return [bus_id for bus_id in ids if bus_id]


+class RocmSmiCommand(Command):
+    name = "rocm-smi"
+
+    def get_pci_bus_ids(self) -> list[str]:
+        """
+        Specifically queries for and returns a list of GPU PCI bus IDs.
+        """
+        # The '--showbus' and '--json' flags provide a reliable, machine-readable output.
+        command_str = f"{self.name} --showbus --json"
+
+        # {"card0": {"PCI Bus": "0000:03:00.0"}, "card1": ..., "card7": {"PCI Bus": "0000:E3:00.0"}}
+        output = self.run(command_str, shell=True)
+        data = json.loads(output)
+
+        pci_ids = []
+        # I'm choosing not to sort so the devices are read in the order provided.
+        for card_key in data.keys():
+            card_info = data[card_key]
+            pci_ids.append(card_info.get("PCI Bus"))
+        return pci_ids
+
+
 hwloc_calc = HwlocCalcCommand()
 nvidia_smi = NvidiaSmiCommand()
+rocm_smi = RocmSmiCommand()
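The new RocmSmiCommand relies on rocm-smi --showbus --json for machine-readable output. A standalone sketch of the same parsing against a payload shaped like the in-code comment (the bus values are illustrative, and exact key names can vary between ROCm releases):

import json

# Sample output shaped like the comment in RocmSmiCommand (illustrative bus values).
sample = '{"card0": {"PCI Bus": "0000:03:00.0"}, "card1": {"PCI Bus": "0000:E3:00.0"}}'
data = json.loads(sample)

# Preserve the reported card order rather than sorting, matching the commit's choice.
pci_ids = [info.get("PCI Bus") for info in data.values()]
print(pci_ids)  # ['0000:03:00.0', '0000:E3:00.0']

The comprehension is equivalent to the loop in the class; both keep the card order that rocm-smi reports.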

fluxbind/shape/shape.py

Lines changed: 128 additions & 43 deletions
@@ -26,13 +26,53 @@ def __init__(self, filepath, machine="machine:0"):
         self.num_pus = commands.hwloc_calc.count("pu", within=self.machine)
         self.numa_node_cpusets = commands.hwloc_calc.list_cpusets("numa", within=self.machine)
         self.pus_per_core = self.num_pus // self.num_cores if self.num_cores > 0 else 0
-        self.gpu_pci_ids = self.discover_gpus()
+        # For GPU topology, we care about NUMA nodes.
+        self.gpus_by_numa = self.discover_gpus()

-    def discover_gpus(self) -> list:
+    def discover_gpus(self):
         """
         Discovers available GPU PCI bus IDs.
         """
-        return commands.nvidia_smi.get_pci_bus_ids()
+        all_pci_ids = []
+
+        # Try nvidia and then rocm
+        for command in [commands.nvidia_smi.get_pci_bus_ids, commands.rocm_smi.get_pci_bus_ids]:
+            try:
+                # These are PCI addresses ACROSS numa nodes
+                all_pci_ids = command()
+            except Exception:
+                pass
+
+        gpus_by_numa = {}
+
+        # For each GPU, find out which NUMA node it belongs to
+        for pci_id in all_pci_ids:
+            # Ask hwloc for the cpuset where the pci device lives.
+            # I'm not sure if this will work for nvidia if it doesn't show in lstopo
+            gpu_cpuset = commands.hwloc_calc.get_cpuset(f"pci={pci_id}")
+            found_numa = False
+            for i, numa_cpuset in enumerate(self.numa_node_cpusets):
+                # Check if the GPU's cpuset is a subset of this NUMA node's cpuset
+                intersection = commands.hwloc_calc.get_cpuset(f"'{gpu_cpuset}' x '{numa_cpuset}'")
+
+                if intersection == gpu_cpuset:
+                    if i not in gpus_by_numa:
+                        gpus_by_numa[i] = []
+                    gpus_by_numa[i].append(pci_id)
+                    found_numa = True
+                    break
+
+            # Raise an error - I want to know about this case.
+            if not found_numa:
+                raise ValueError(f"Could not determine NUMA locality for GPU {pci_id}")
+
+        # Keep an ordered list as well, for easy index-based access
+        self.ordered_gpus = []
+        for numa_idx in sorted(gpus_by_numa.keys()):
+            for pci_id in gpus_by_numa[numa_idx]:
+                self.ordered_gpus.append({"pci_id": pci_id, "numa_index": numa_idx})
+
+        return gpus_by_numa

     def load_file(self, filepath):
         """
@@ -92,61 +132,106 @@ def find_matching_rule(self, rank: int, node_id: int) -> dict:
         return None

     def get_gpu_local_binding(self, rule: dict, local_rank: int, gpus_per_task: int) -> str:
-        assignment = gpus.GPUAssignment.for_rank(local_rank, gpus_per_task, self.gpu_pci_ids)
-        pci_locations = [f"pci={pci_id}" for pci_id in assignment.pci_ids]
-        domain_cpuset = commands.hwloc_calc.union_of_locations(pci_locations)
-        cpu_binding_string = self.get_gpu_cpu_binding(
-            rule, local_rank, gpus_per_task, domain_cpuset
-        )
-        return f"{cpu_binding_string};{assignment.cuda_devices}"
+        """
+        Calculate a 'gpu-local' binding using the topology-aware ordered GPU list.
+        """
+        if not self.ordered_gpus:
+            raise RuntimeError("Shape specifies 'bind: gpu-local', but no GPUs were discovered.")
+
+        # Assign a slice of GPUs from the canonical, ordered list.
+        start_idx = local_rank * gpus_per_task
+        end_idx = start_idx + gpus_per_task
+        if end_idx > len(self.ordered_gpus):
+            raise ValueError(f"Not enough total GPUs to satisfy request for rank {local_rank}.")
+
+        assigned_gpu_slice = self.ordered_gpus[start_idx:end_idx]
+        cuda_devices = ",".join([str(start_idx + i) for i, _ in enumerate(assigned_gpu_slice)])
+
+        # The CPU domain is the union of NUMA nodes for the assigned GPUs.
+        local_numa_indices = sorted(list({gpu["numa_index"] for gpu in assigned_gpu_slice}))
+        domain_locations = [f"numa:{i}" for i in local_numa_indices]
+        domain = " ".join(domain_locations)  # e.g., "numa:0" or "numa:0 numa:1"
+
+        # Get the final CPU binding WITHIN that domain.
+        cpu_binding_string = self.get_binding_in_gpu_domain(rule, local_rank, gpus_per_task, domain)
+        return f"{cpu_binding_string};{cuda_devices}"

     def get_gpu_remote_binding(self, rule: dict, local_rank: int, gpus_per_task: int) -> str:
+        """
+        Calculate a 'gpu-remote' binding using the topology-aware ordered GPU list.
+        """
         if len(self.numa_node_cpusets) < 2:
             raise RuntimeError("'bind: gpu-remote' is invalid on a single-NUMA system.")
-        assignment = gpus.GPUAssignment.for_rank(local_rank, gpus_per_task, self.gpu_pci_ids)
-        primary_gpu_pci_id = assignment.pci_ids[0]
-        primary_gpu_numa_cpuset = commands.hwloc_calc.get_cpuset(f"numaof:pci={primary_gpu_pci_id}")
-        remote_numa_cpusets = [cs for cs in self.numa_node_cpusets if cs != primary_gpu_numa_cpuset]
-        if not remote_numa_cpusets:
+        if not self.ordered_gpus:
+            raise RuntimeError("Shape specifies 'bind: gpu-remote', but no GPUs were discovered.")
+
+        # Assign a slice of GPUs to determine the local NUMA domains.
+        start_idx = local_rank * gpus_per_task
+        end_idx = start_idx + gpus_per_task
+        if end_idx > len(self.ordered_gpus):
+            raise ValueError(f"Not enough total GPUs to satisfy request for rank {local_rank}.")
+
+        assigned_gpu_slice = self.ordered_gpus[start_idx:end_idx]
+        cuda_devices = ",".join([str(start_idx + i) for i, _ in enumerate(assigned_gpu_slice)])
+
+        # Find the set of all local NUMA domains for this rank's GPUs.
+        local_numa_indices = {gpu["numa_index"] for gpu in assigned_gpu_slice}
+
+        # Find all remote NUMA domains.
+        all_numa_indices = set(range(len(self.numa_node_cpusets)))
+        remote_numa_indices = sorted(list(all_numa_indices - local_numa_indices))
+
+        if not remote_numa_indices:
             raise RuntimeError(
-                f"Could not find a NUMA node remote from GPU {assignment.indices[0]}."
+                f"Cannot find a remote NUMA node for rank {local_rank}; its GPUs span all NUMA domains."
             )
+
+        # Select the target remote domain.
         offset = rule.get("offset", 0)
-        domain = remote_numa_cpusets[offset]
-        cpu_binding_string = self.get_gpu_cpu_binding(rule, local_rank, gpus_per_task, domain)
-        return f"{cpu_binding_string};{assignment.cuda_devices}"
+        if offset >= len(remote_numa_indices):
+            raise ValueError(f"Offset {offset} is out of range for remote NUMA domains.")
+
+        target_remote_numa_idx = remote_numa_indices[offset]
+        domain = f"numa:{target_remote_numa_idx}"

-    def get_gpu_cpu_binding(
+        # Get the final CPU binding WITHIN that remote domain.
+        cpu_binding_string = self.get_binding_in_gpu_domain(rule, local_rank, gpus_per_task, domain)
+        return f"{cpu_binding_string};{cuda_devices}"
+
+    def get_binding_in_gpu_domain(
         self, rule: dict, local_rank: int, gpus_per_task: int, domain: str
     ) -> str:
+        """
+        A dedicated binding engine for GPU jobs. It applies user preferences within a calculated
+        domain (e.g., "numa:0" or "numa:0 numa:1").
+        """
         hwloc_type = rule.get("type")
         if not hwloc_type:
             raise ValueError("Rule with GPU binding must have a 'type'.")

         if hwloc_type in ["numa", "package", "l3cache", "machine"]:
-            # The user wants to bind to the entire domain, try to get location for it
-            return commands.hwloc_calc.get_cpuset(f"'{domain}'")
-
-        elif hwloc_type in ["core", "pu", "l2cache"]:
-            # This logic returns a simple name like "core:5", which is correct.
-            if "prefer" in rule:
-                try:
-                    requested_index = int(rule["prefer"])
-                    return commands.hwloc_calc.get_object_in_set(
-                        domain, hwloc_type, requested_index
-                    )
-                except (ValueError, RuntimeError, TypeError):
-                    print(
-                        f"Warning: Preferred index '{rule['prefer']}' invalid/not in domain '{domain}'. Falling back.",
-                        file=sys.stderr,
-                    )
-
-            rank_index_in_gpu_group = local_rank // gpus_per_task
-            return commands.hwloc_calc.get_object_in_set(
-                domain, hwloc_type, rank_index_in_gpu_group
-            )
-        else:
-            raise ValueError(f"Unsupported type '{hwloc_type}' for GPU locality binding.")
+            # If a broad type is requested, the binding is the domain itself.
+            return domain
+
+        if "prefer" in rule:
+            try:
+                requested_index = int(rule["prefer"])
+                # Validate by attempting to get the object.
+                return commands.hwloc_calc.get_object_in_set(domain, hwloc_type, requested_index)
+            except (ValueError, RuntimeError, TypeError):
+                print(
+                    f"Warning: Preferred index '{rule['prefer']}' invalid/not in domain '{domain}'. Falling back.",
+                    file=sys.stderr,
+                )
+
+        # Default assignment: the rank's Nth turn for a resource of this type within its GPU group.
+        # This is the correct index for packing sub-objects within a domain.
+        index = local_rank // gpus_per_task if gpus_per_task > 0 else local_rank
+
+        # For patterns like interleave or spread, the index calculation would need to be
+        # more complex, but this is the logic for a simple packed pattern.
+        # Assume packed placement for now, since patterns are not yet implemented here.
+        return commands.hwloc_calc.get_object_in_set(domain, hwloc_type, index)

     def get_binding_for_rank(self, rank, node_id, local_rank, gpus_per_task=None) -> str:
         """
