Skip to content

Commit 01e7a62

Browse files
committed
cli: add support for gpu affinity and tasks
Signed-off-by: vsoch <[email protected]>
1 parent ce2ba0d commit 01e7a62

File tree

7 files changed

+92
-55
lines changed

7 files changed

+92
-55
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
default:
22
type: core
3-
on: gpu-local
3+
bind: gpu-local
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
default:
22
type: numa
3-
on: gpu-local
3+
bind: gpu-local

fluxbind/bind/bind.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ def __init__(self, **kwargs):
108108
"exclusive",
109109
"cores_per_task",
110110
"tasks_per_core",
111+
"gpus_per_task",
112+
"gpu_affinity",
111113
"cpu_affinity",
112114
"taskmap",
113115
"command",
@@ -172,19 +174,31 @@ def get_custom_command(self):
172174
A custom command uses flux to ask for a specific binding
173175
"""
174176
cmd = ["flux", "run", "-N", str(self.nodes)]
177+
cmd = self.set_envars(cmd)
178+
cmd = self.set_flags(cmd)
179+
180+
# This is the main difference between a shape and not command.
181+
# We don't have to ask for exclusive here. For shape, we do.
182+
if self.exclusive:
183+
cmd.append("--exclusive")
184+
return cmd
185+
186+
def set_flags(self, cmd):
187+
"""
188+
Set command flags.
189+
"""
175190
if self.tasks is not None:
176191
cmd += ["-n", str(self.tasks)]
177192
if self.cpu_affinity is not None:
178193
cmd += ["-o", f"cpu-affinity={self.cpu_affinity}"]
179-
if self.exclusive:
180-
cmd.append("--exclusive")
194+
if self.gpu_affinity is not None:
195+
cmd += ["-o", f"gpu-affinity={self.gpu_affinity}"]
181196
if self.cores_per_task is not None:
182197
cmd += ["--cores-per-task", str(self.cores_per_task)]
183198
if self.tasks_per_core is not None:
184199
cmd += ["--tasks-per-core", str(self.tasks_per_core)]
185200
if self.taskmap is not None:
186201
cmd += [f"--taskmap={self.taskmap}"]
187-
cmd = self.set_envars(cmd)
188202
return cmd
189203

190204
def set_envars(self, cmd):
@@ -209,6 +223,8 @@ def get_shape_command(self):
209223
"""
210224
A shape command requires exclusive (for now) and then exports
211225
(provides) the JOB_SHAPE_FILE to the job.
226+
227+
TODO combine this into one command.
212228
"""
213229
cmd = [
214230
"flux",
@@ -218,16 +234,7 @@ def get_shape_command(self):
218234
"--exclusive",
219235
]
220236
cmd = self.set_envars(cmd)
221-
if self.tasks is not None:
222-
cmd += ["-n", str(self.tasks)]
223-
if self.tasks_per_core is not None:
224-
cmd += ["--tasks-per-core", str(self.tasks_per_core)]
225-
if self.cpu_affinity is not None:
226-
cmd += ["-o", f"cpu-affinity={self.cpu_affinity}"]
227-
if self.env is not None:
228-
for envar in self.env:
229-
cmd += ["--env", envar]
230-
return cmd
237+
return self.set_flags(cmd)
231238

232239
def execute(self, script):
233240
"""

fluxbind/cli/__init__.py

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ def get_parser():
2929
action="store_true",
3030
)
3131

32+
parser.add_argument(
33+
"--quiet",
34+
dest="quiet",
35+
help="suppress additional output.",
36+
default=False,
37+
action="store_true",
38+
)
3239
parser.add_argument(
3340
"--version",
3441
dest="version",
@@ -64,7 +71,11 @@ def get_parser():
6471
"--cpu-affinity",
6572
default=None,
6673
help="Add cpu-affinity",
67-
# choices=["none", "per-task"],
74+
)
75+
run.add_argument(
76+
"--gpu-affinity",
77+
default=None,
78+
help="Add gpu-affinity",
6879
)
6980
run.add_argument("-N", "--nodes", type=int, default=1, help="The number of nodes (default: 1).")
7081
run.add_argument(
@@ -93,32 +104,12 @@ def get_parser():
93104
help="The number of CORES (not PUs) to bind per task.",
94105
)
95106
run.add_argument(
96-
"--tasks-per-core",
107+
"-g",
108+
"--gpus-per-task",
97109
type=int,
98110
default=None,
99-
help="The number of tasks per core.",
100-
)
101-
run.add_argument(
102-
"--silent",
103-
dest="silent",
104-
help="no additional output.",
105-
default=False,
106-
action="store_true",
107-
)
108-
run.add_argument(
109-
"--quiet",
110-
dest="quiet",
111-
help="suppress additional output (only print fluxbind mapping)",
112-
default=False,
113-
action="store_true",
114-
)
115-
run.add_argument(
116-
"--nocolor",
117-
help="suppress color output (e.g., piping to log)",
118-
default=False,
119-
action="store_true",
111+
help="The number of GPUs per task.",
120112
)
121-
122113
predict = subparsers.add_parser(
123114
"predict",
124115
formatter_class=argparse.RawTextHelpFormatter,
@@ -186,7 +177,7 @@ def help(return_code=0):
186177
sys.exit(0)
187178

188179
setup_logger(
189-
quiet=False,
180+
quiet=args.quiet,
190181
debug=args.debug,
191182
)
192183

fluxbind/scripts/run_mapping.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ fi
2424
# Call the fluxbind helper script to get the target location string (e.g., "core:0" or "UNBOUND")
2525
# It ALWAYS returns a single line in the format: BIND_LOCATION,CUDA_DEVICE_ID
2626
# For CPU jobs, CUDA_DEVICE_ID will be the string "NONE".
27-
BIND_LOCATION=$(fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank")
27+
BIND_INFO=$(fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank")
2828

2929
# Exit if the helper script failed
3030
if [ $? -ne 0 ]; then
@@ -80,6 +80,9 @@ if [[ "$FLUXBIND_QUIET" != "1" ]]
8080
echo -e "${prefix}: Effective Cpuset Mask: ${CYAN}$cpuset_mask${RESET}"
8181
echo -e "${prefix}: Logical CPUs (PUs): ${BLUE}${logical_cpu_list:-none}${RESET}"
8282
echo -e "${prefix}: Physical Cores: ${ORANGE}${physical_core_list:-none}${RESET}"
83+
if [[ "$CUDA_DEVICE" != "NONE" ]]; then
84+
echo -e "${prefix}: CUDA Devices: ${YELLOW}${CUDA_DEVICE}${RESET}"
85+
fi
8386
echo
8487
fi
8588

@@ -92,6 +95,11 @@ if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
9295
exec "$@"
9396
else
9497
# Use hwloc-bind to set the affinity and then execute the command.
95-
if [[ "$FLUXBIND_SILENT" != "1" ]]; then echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} to execute: $@" >&2; fi
98+
if [[ "$FLUXBIND_SILENT" != "1" ]]; then
99+
if [[ "$CUDA_DEVICE" != "NONE" ]]; then
100+
echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} cuda:${CUDA_DEVICE} to execute: $@" >&2
101+
else
102+
echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} to execute: $@" >&2
103+
fi
fi
96104
exec hwloc-bind "${BIND_LOCATION}" -- "$@"
97105
fi

fluxbind/shape/commands.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
# In file: fluxbind/commands.py
2-
import re
1+
import shlex
32
import subprocess
43
import sys
54

@@ -35,9 +34,12 @@ def execute(self, args_list: list) -> str:
3534
Executes hwloc-calc with a list of arguments.
3635
This is safer as it avoids shell interpretation of the arguments.
3736
"""
37+
if isinstance(args_list, str):
38+
args_list = shlex.split(args_list)
39+
3840
# A more robust validation could be added here if needed,
3941
command_list = [self.name] + args_list
40-
return self._run(command_list, shell=False)
42+
return self.run(command_list, shell=False)
4143

4244

4345
class NvidiaSmiCommand(Command):
@@ -51,7 +53,7 @@ def get_pci_bus_ids(self) -> list[str]:
5153
command_str = f"{self.name} --query-gpu=pci.bus_id --format=csv,noheader"
5254

5355
# shell=True is safe here because the entire command is static and defined internally.
54-
output = self._run(command_str, shell=True)
56+
output = self.run(command_str, shell=True)
5557

5658
# Parse the output into a clean list
5759
ids = output.strip().split("\n")

fluxbind/shape/shape.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,31 +104,60 @@ def find_matching_rule(self, rank: int, node_id: int) -> dict:
104104
def get_gpu_local_binding(self, rule: dict, local_rank: int) -> str:
105105
"""
106106
Calculates binding for a rank based on its proximity to an assigned GPU.
107+
Supports an optional prefer key for user-preferred object selection.
107108
"""
108109
if not self.gpu_pci_ids:
109-
raise RuntimeError("Shape specifies 'on: gpu-local', but no GPUs were discovered.")
110+
raise RuntimeError(
111+
"Shape specifies 'bind: gpu-local', but no GPUs were discovered."
112+
)
110113

111114
num_gpus = len(self.gpu_pci_ids)
112115
hwloc_type = rule.get("type")
113116
if not hwloc_type:
114-
raise ValueError(
115-
"Rule with 'on: gpu-local' must also specify a 'type' (e.g., core, numa)."
116-
)
117+
raise ValueError("Rule with 'bind: gpu-local' must also specify a 'type'.")
117118

119+
# 1. Assign a GPU to this rank (round-robin)
118120
target_gpu_index = local_rank % num_gpus
119121
cuda_devices = str(target_gpu_index)
120122
target_gpu_pci_id = self.gpu_pci_ids[target_gpu_index]
123+
124+
# 2. Get the cpuset for the GPU's locality domain
121125
gpu_locality_cpuset = commands.hwloc_calc.execute([f"pci={target_gpu_pci_id}"])
126+
cpu_binding_string = ""
122127

128+
# 3. Determine the final CPU binding
123129
if hwloc_type in ["numa", "package", "l3cache"]:
124130
cpu_binding_string = gpu_locality_cpuset
131+
125132
elif hwloc_type in ["core", "pu", "l2cache"]:
126-
rank_index_in_gpu_group = local_rank // num_gpus
127-
cpu_binding_string = commands.HwlocCalcCommand().execute(
128-
f'"{gpu_locality_cpuset}" -I {hwloc_type}:{rank_index_in_gpu_group}'
133+
all_objects_in_domain_str = commands.hwloc_calc.execute(
134+
[gpu_locality_cpuset, "--intersect", hwloc_type]
129135
)
136+
available_indices = all_objects_in_domain_str.split(",")
137+
target_object_index = None
138+
139+
# Is the user asking to prefer a specific identifier?
140+
if "prefer" in rule:
141+
requested_index = str(rule["prefer"])
142+
if requested_index in available_indices:
143+
target_object_index = requested_index
144+
145+
if target_object_index is None:
146+
rank_index_in_gpu_group = local_rank // num_gpus
147+
try:
148+
target_object_index = available_indices[rank_index_in_gpu_group]
149+
except IndexError:
150+
raise ValueError(
151+
f"Cannot find the {rank_index_in_gpu_group}-th '{hwloc_type}' for local_rank {local_rank} "
152+
f"within the GPU's locality. Only {len(available_indices)} are available."
153+
)
154+
155+
cpu_binding_string = f"{hwloc_type}:{target_object_index}"
130156
else:
131-
raise ValueError(f"Unsupported type '{hwloc_type}' for 'on: gpu-local' binding.")
157+
raise ValueError(f"Unsupported type '{hwloc_type}' for 'bind: gpu-local' binding.")
158+
159+
if not cpu_binding_string:
160+
raise RuntimeError("Failed to calculate a valid cpu_binding_string.")
132161

133162
return f"{cpu_binding_string},{cuda_devices}"
134163

@@ -155,7 +184,7 @@ def get_binding_for_rank(self, rank: int, node_id: int, local_rank: int) -> str:
155184
raise ValueError(f"Matching rule has no 'type' defined: {rule}")
156185

157186
# Are we doing something with GPU?
158-
if rule.get("on") == "gpu-local":
187+
if rule.get("bind") == "gpu-local":
159188
return self.get_gpu_local_binding(rule, local_rank)
160189
cpu_binding_string = self.get_cpu_binding(hwloc_type, rule, local_rank)
161190

0 commit comments

Comments
 (0)