Skip to content

Commit ef0f985

Browse files
committed
gpus: refactor remote gpu definition
Signed-off-by: vsoch <[email protected]>
1 parent 30ad40a commit ef0f985

File tree

7 files changed

+146
-145
lines changed

7 files changed

+146
-145
lines changed

fluxbind/bind/bind.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,9 @@ def set_envars(self, cmd):
209209
"""
210210
if self.quiet:
211211
cmd += ["--env", "FLUXBIND_QUIET=1"]
212+
# Pass gpus-per-task through the environment so the shape calculator can use it
213+
if self.gpus_per_task not in [0, None]:
214+
cmd += ["--env", f"GPUS_PER_TASK={self.gpus_per_task}"]
212215
if self.nocolor:
213216
cmd += ["--env", "FLUXBIND_NOCOLOR=1"]
214217
if self.silent:

fluxbind/cli/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ def get_parser():
136136
shape.add_argument(
137137
"--local-rank", required=True, type=int, help="Rank of the process on the local node."
138138
)
139+
shape.add_argument(
140+
"--gpus-per-task", dest="gpus_per_task", type=int, help="Number of GPUs per task."
141+
)
139142

140143
for command in [predict, run]:
141144
command.add_argument(

fluxbind/cli/shape.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ def main(args, extra, **kwargs):
99

1010
# 2. Call the public method to get the final binding string
1111
binding_string = shape.get_binding_for_rank(
12-
rank=args.rank, node_id=args.node_id, local_rank=args.local_rank
12+
rank=args.rank,
13+
node_id=args.node_id,
14+
local_rank=args.local_rank,
15+
gpus_per_task=args.gpus_per_task,
1316
)
1417

1518
# 3. Print the result to stdout for the wrapper script

fluxbind/scripts/run_mapping.sh

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@ if [ -z "$JOB_SHAPE_FILE" ]; then
2121
exit 1
2222
fi
2323

24+
gpus_per_task=${GPUS_PER_TASK:-1}
25+
2426
# Call the fluxbind helper script to get the target location string (e.g., "core:0" or "UNBOUND")
2527
# It ALWAYS returns a single line in the format: BIND_LOCATION,CUDA_DEVICE_ID
2628
# For CPU jobs, CUDA_DEVICE_ID will be the string "NONE".
27-
echo fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank"
28-
BIND_INFO=$(fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank")
29+
echo fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank" --gpus-per-task "$gpus_per_task"
BIND_INFO=$(fluxbind shape --file "$JOB_SHAPE_FILE" --rank "$rank" --node-id "$node_id" --local-rank "$local_rank" --gpus-per-task "$gpus_per_task")
# Capture the exit status immediately: the echo below would otherwise
# reset $? and the failure check would never fire.
shape_rc=$?
# Quote to preserve the helper's output verbatim (no word splitting/globbing).
echo "$BIND_INFO"

# Exit if the helper script failed
if [ $shape_rc -ne 0 ]; then
@@ -34,10 +37,14 @@ if [ $? -ne 0 ]; then
3437
fi
3538

3639
# 3. Parse the simple, machine-readable output.
37-
BIND_LOCATION=$(echo "$BIND_INFO" | cut -d',' -f1)
38-
CUDA_DEVICE=$(echo "$BIND_INFO" | cut -d',' -f2)
39-
if [[ "$CUDA_DEVICE" != "NONE" ]]; then
40-
export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE
40+
BIND_LOCATION="${BIND_INFO%;*}"
41+
CUDA_DEVICES="${BIND_INFO#*;}"
42+
43+
echo "BIND_LOCATION: ${BIND_LOCATION}"
44+
echo "CUDA_DEVICES: ${CUDA_DEVICES}"
45+
46+
if [[ "$CUDA_DEVICES" != "NONE" ]]; then
47+
export CUDA_VISIBLE_DEVICES=$CUDA_DEVICES
4148
fi
4249

4350
if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
@@ -81,26 +88,27 @@ if [[ "$FLUXBIND_QUIET" != "1" ]]
8188
echo -e "${prefix}: Effective Cpuset Mask: ${CYAN}$cpuset_mask${RESET}"
8289
echo -e "${prefix}: Logical CPUs (PUs): ${BLUE}${logical_cpu_list:-none}${RESET}"
8390
echo -e "${prefix}: Physical Cores: ${ORANGE}${physical_core_list:-none}${RESET}"
84-
if [[ "$CUDA_DEVICE" != "NONE" ]]; then
85-
echo -e "${prefix}: CUDA Devices: ${YELLOW}${CUDA_DEVICE}${RESET}"
91+
if [[ "$CUDA_DEVICES" != "NONE" ]]; then
92+
echo -e "${prefix}: CUDA Devices: ${YELLOW}${CUDA_DEVICES}${RESET}"
8693
fi
8794
echo
8895
fi
8996

9097
# The 'exec' command replaces this script's process, preserving the env.
9198
# I learned this developing singularity shell, exec, etc :)
99+
if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
    if [[ "$FLUXBIND_SILENT" != "1" ]]; then echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is ${BIND_LOCATION} to execute: $@" >&2; fi
else
    # Honor FLUXBIND_SILENT here too (the UNBOUND branch already does), and
    # only mention CUDA devices when some were actually assigned — the
    # previous check was inverted and printed "cuda:NONE" for CPU-only jobs.
    if [[ "$FLUXBIND_SILENT" != "1" ]]; then
        if [[ "$CUDA_DEVICES" != "NONE" ]]; then
            echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} cuda:${CUDA_DEVICES} to execute: $@" >&2;
        else
            echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} to execute: $@" >&2;
        fi
    fi
fi
92108

93109
if [[ "${BIND_LOCATION}" == "UNBOUND" ]]; then
94110
# Execute the command directly without changing affinity.
95-
if [[ "$FLUXBIND_SILENT" != "1" ]]; then echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is ${BIND_LOCATION} to execute: $@" >&2; fi
96111
exec "$@"
97112
else
98-
# Use hwloc-bind to set the affinity and then execute the command.
99-
if [[ "$FLUXBIND_SILENT" != "1" ]]; then
100-
if [[ "$CUDA_DEVICE" == "NONE" ]]; then
101-
echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} cuda:${CUDA_DEVICE} to execute: $@" >&2; fi
102-
else
103-
echo -e "${GREEN}fluxbind${RESET}: Rank ${rank} is bound to ${BIND_LOCATION} to execute: $@" >&2; fi
104-
fi
105113
exec hwloc-bind "${BIND_LOCATION}" -- "$@"
106114
fi

fluxbind/shape/commands.py

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import shlex
21
import subprocess
32
import sys
3+
from itertools import zip_longest
44

55

66
class Command:
@@ -29,64 +29,93 @@ def run(self, command, shell: bool = False):
2929
class HwlocCalcCommand(Command):
3030
name = "hwloc-calc"
3131

32+
def _parse_cpuset_to_list(self, cpuset_str: str) -> list[int]:
33+
"""
34+
Convert a potentially comma-separated hex string into a list of integers.
35+
"""
36+
if not cpuset_str or cpuset_str.lower() in ["0x0", "0"]:
37+
return [0]
38+
return [int(chunk, 16) for chunk in cpuset_str.strip().split(",")]
39+
40+
def _operate_on_lists(self, list_a: list[int], list_b: list[int], operator: str) -> list[int]:
41+
"""
42+
Perform a bitwise operation on two lists of cpuset integers.
43+
"""
44+
max_len = max(len(list_a), len(list_b))
45+
result_list = []
46+
for i in range(max_len):
47+
val_a = list_a[i] if i < len(list_a) else 0
48+
val_b = list_b[i] if i < len(list_b) else 0
49+
50+
if operator == "+":
51+
result_list.append(val_a | val_b)
52+
elif operator == "x":
53+
result_list.append(val_a & val_b)
54+
elif operator == "^":
55+
result_list.append(val_a ^ val_b)
56+
elif operator == "~":
57+
result_list.append(val_a & ~val_b)
58+
else:
59+
raise ValueError(f"Unsupported operator '{operator}'")
60+
return result_list
61+
3262
def count(self, hw_type: str, within: str = "machine:0") -> int:
3363
"""
3464
Returns the total number of a specific hardware object.
35-
36-
Args:
37-
hw_type: The type of object to count (e.g., "core", "numa").
38-
within_object: Optional object to restrict the count to (e.g., "numa:0").
3965
"""
4066
try:
4167
args = ["--number-of", hw_type, within]
42-
result_stdout = self.run([self.name] + args)
68+
result_stdout = self.run([self.name] + args, shell=False)
4369
return int(result_stdout)
4470
except (RuntimeError, ValueError) as e:
4571
raise RuntimeError(f"Failed to count number of '{hw_type}': {e}")
4672

4773
def list_cpusets(self, hw_type: str, within: str = "machine:0") -> list[str]:
4874
"""
4975
Returns a list of cpuset strings for each object of a given type.
50-
51-
Args:
52-
hw_type: The type of object to list (e.g., "numa").
53-
within_object: Optional object to restrict the list to.
5476
"""
5577
try:
56-
# Get the indices of all objects of this type
5778
args_intersect = ["--intersect", hw_type, within]
58-
indices_str = self.run([self.name] + args_intersect)
79+
indices_str = self.run([self.name] + args_intersect, shell=False)
5980
indices = indices_str.split(",")
60-
61-
# Cut out early
6281
if not indices or not indices[0]:
6382
return []
64-
65-
# For each index, get its specific cpuset
66-
return [self.run([self.name, f"{hw_type}:{i}"]) for i in indices]
83+
return [self.run([self.name, f"{hw_type}:{i}"], shell=False) for i in indices]
6784
except (RuntimeError, ValueError) as e:
6885
raise RuntimeError(f"Failed to list cpusets for '{hw_type}': {e}")
6986

7087
def get_cpuset(self, location: str) -> str:
7188
"""
72-
Gets the cpuset for a single, specific location string (e.g., "pci=...", "core:0").
89+
Gets the cpuset for one or more space/operator-separated location strings.
7390
"""
74-
return self.run([self.name, location])
91+
return self.run(f"{self.name} {location}", shell=True)
7592

7693
def get_object_in_set(self, cpuset: str, obj_type: str, index: int) -> str:
7794
"""
7895
Gets the Nth object of a type within a given cpuset.
79-
e.g., find the 1st 'core' within cpuset '0x00ff'.
8096
"""
81-
# This uses the robust two-step process internally
82-
all_objects_str = self.run([self.name, cpuset, "--intersect", obj_type])
83-
available_indices = all_objects_str.split(",")
97+
list_cmd = f"{self.name} '{cpuset}' --intersect {obj_type}"
98+
all_indices_str = self.run(list_cmd, shell=True)
99+
available_indices = all_indices_str.split(",")
84100
try:
85101
target_index = available_indices[index]
86102
return f"{obj_type}:{target_index}"
87103
except IndexError:
88104
raise ValueError(f"Cannot find the {index}-th '{obj_type}' in cpuset {cpuset}.")
89105

106+
def union_of_locations(self, locations: list[str]) -> str:
107+
"""
108+
Calculates the union of a list of hwloc location strings using Python logic.
109+
Returns a single, SPACE-separated string of hex cpusets.
110+
"""
111+
union_mask_list = [0]
112+
113+
for loc in locations:
114+
loc_cpuset_str = self.get_cpuset(loc)
115+
loc_cpuset_list = self._parse_cpuset_to_list(loc_cpuset_str)
116+
union_mask_list = self._union_of_lists(union_mask_list, loc_cpuset_list)
117+
return " ".join([hex(chunk) for chunk in union_mask_list])
118+
90119

91120
class NvidiaSmiCommand(Command):
92121
name = "nvidia-smi"

fluxbind/shape/gpu.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,36 @@ class GPUAssignment:
77
Data structure to hold information about a rank's assigned GPUs.
88
"""
99

10-
index: int # logical index of the GPU
11-
pci_id: str # The PCI bus ID of the GPU
10+
indices: list[int]  # Logical indices of the GPUs assigned to this rank
pci_ids: list[str]  # PCI bus IDs corresponding to those indices
cuda_devices: str  # Comma-separated value for CUDA_VISIBLE_DEVICES

@classmethod
def for_rank(cls, local_rank, gpus_per_task=None, gpu_pci_ids=None):
    """
    A factory method that assigns a contiguous block of GPUs to a local rank.

    Rank r with g GPUs per task receives indices [r*g, (r+1)*g). This is
    block assignment, not round-robin: indices never wrap around, and a
    request past the end of the GPU list is an error.

    Args:
        local_rank: Rank of the process on its node (0-based).
        gpus_per_task: GPUs to assign per task; None/0 defaults to 1.
        gpu_pci_ids: Ordered list of discovered GPU PCI bus IDs.

    Raises:
        RuntimeError: if no GPUs were discovered.
        ValueError: if local_rank is negative or the request exceeds the
            available GPU count.
    """
    if not gpu_pci_ids:
        raise RuntimeError("Attempted to assign a GPU, but no GPUs were discovered.")

    # Guard: a negative rank would silently index from the end of the list.
    if local_rank < 0:
        raise ValueError(f"local_rank must be non-negative, got {local_rank}.")

    # Treat None/0 as the default of one GPU per task.
    gpus_per_task = gpus_per_task or 1

    # Contiguous block of GPU indices for this rank.
    start_gpu_index = local_rank * gpus_per_task
    end_gpu_index = start_gpu_index + gpus_per_task

    if end_gpu_index > len(gpu_pci_ids):
        raise ValueError(
            f"Cannot satisfy request for {gpus_per_task} GPUs for local_rank {local_rank}. "
            f"Only {len(gpu_pci_ids)} GPUs available in total."
        )

    # Return the assignment
    assigned_indices = list(range(start_gpu_index, end_gpu_index))
    return cls(
        indices=assigned_indices,
        pci_ids=[gpu_pci_ids[i] for i in assigned_indices],
        cuda_devices=",".join(str(i) for i in assigned_indices),
    )

0 commit comments

Comments
 (0)