Skip to content

Commit ce6b5d2

Browse files
committed
refactor: support backend visible devices value alignment
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent ef85950 commit ce6b5d2

File tree

4 files changed

+98
-20
lines changed

4 files changed

+98
-20
lines changed

gpustack_runtime/deployer/__types__.py

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,17 @@ class Deployer(ABC):
12801280
"AMD_VISIBLE_DEVICES": ["0", "1"]
12811281
}.
12821282
"""
1283+
_visible_devices_values_alignment: dict[str, dict[str, str]] | None = None
1284+
"""
1285+
Recorded visible devices values alignment,
1286+
the key is the runtime visible devices env name,
1287+
the value is the mapping from backend device index to aligned index.
1288+
For example:
1289+
{
1290+
"NVIDIA_VISIBLE_DEVICES": {"0": "0"},
1291+
"AMD_VISIBLE_DEVICES": {"0": "0", "1": "1"}
1292+
}.
1293+
"""
12831294

12841295
@staticmethod
12851296
@abstractmethod
@@ -1325,6 +1336,7 @@ def _fetch_visible_devices_env_values(self):
13251336

13261337
self._visible_devices_env = {}
13271338
self._visible_devices_values = {}
1339+
self._visible_devices_values_alignment = {}
13281340

13291341
devices: dict[ManufacturerEnum, Devices] = {}
13301342
for dev in detect_devices(fast=False):
@@ -1333,11 +1345,6 @@ def _fetch_visible_devices_env_values(self):
13331345
devices[dev.manufacturer].append(dev)
13341346

13351347
if devices:
1336-
value_with_index = (
1337-
envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_MODE.lower()
1338-
== "index"
1339-
)
1340-
13411348
for manu, devs in devices.items():
13421349
backend = manufacturer_to_backend(manu)
13431350
rk = envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.get(backend)
@@ -1348,18 +1355,30 @@ def _fetch_visible_devices_env_values(self):
13481355
rk,
13491356
)
13501357
if ren and ben:
1358+
dev_uuids: list[str] = []
1359+
dev_indexes: list[str] = []
1360+
for dev in devs:
1361+
dev_uuids.append(dev.uuid)
1362+
dev_indexes.append(str(dev.index))
1363+
dev_indexes_alignment: dict[str, str] = {
1364+
dev_indexes[i]: str(i) for i in range(len(devs))
1365+
}
13511366
self._visible_devices_env[ren] = ben
1352-
self._visible_devices_values[ren] = [
1353-
(str(dev.index) if value_with_index else dev.uuid)
1354-
for dev in devs
1355-
]
1367+
self._visible_devices_values[ren] = (
1368+
dev_uuids
1369+
if ren
1370+
in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1371+
else dev_indexes
1372+
)
1373+
self._visible_devices_values_alignment[ren] = dev_indexes_alignment
13561374

13571375
if self._visible_devices_env:
13581376
return
13591377

13601378
# Fallback to unknown backend
13611379
self._visible_devices_env["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = []
13621380
self._visible_devices_values["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = ["all"]
1381+
self._visible_devices_values_alignment["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = {}
13631382

13641383
def visible_devices_env_values(
13651384
self,
@@ -1389,6 +1408,44 @@ def visible_devices_env_values(
13891408
self._fetch_visible_devices_env_values()
13901409
return self._visible_devices_env, self._visible_devices_values
13911410

1411+
def align_backend_visible_devices_env_values(
    self,
    backend_visible_devices_env: str,
    resource_key_values: str,
) -> str:
    """
    Return the aligned backend visible devices environment variable values.
    For example, if the backend visible devices env is "ASCEND_RT_VISIBLE_DEVICES",
    and the `resource_key_values` is "4,6", and the detected devices are with indexes
    [4,5,6,7], then the aligned result will be "0,2".

    Args:
        backend_visible_devices_env:
            The backend visible devices environment variable name.
        resource_key_values:
            The resource key values to align, as a comma-separated string.

    Returns:
        The aligned backend visible devices environment variable values.
        If no alignment is needed, or no alignment table is recorded for
        the given env name, return the original `resource_key_values`.
        Values without an entry in the alignment table are kept as-is.

    """
    # The setting is declared as `set[str] | None`,
    # so treat None (or an empty set) as "alignment disabled".
    enabled = envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
    if not enabled or backend_visible_devices_env not in enabled:
        return resource_key_values

    # Detect devices only once: the fetch resets and repopulates
    # all recorded visible devices tables together.
    if self._visible_devices_values_alignment is None:
        self._fetch_visible_devices_env_values()

    alignments = self._visible_devices_values_alignment or {}
    alignment = alignments.get(backend_visible_devices_env)
    if alignment is None:
        # The alignment table is recorded under the runtime visible devices
        # env name, so resolve the backend env name via the recorded
        # runtime -> backend env names mapping instead of indexing directly.
        alignment = {}
        for runtime_env, backend_envs in (self._visible_devices_env or {}).items():
            if backend_visible_devices_env in backend_envs:
                alignment = alignments.get(runtime_env, {})
                break

    return ",".join(alignment.get(v, v) for v in resource_key_values.split(","))
1448+
13921449
@property
13931450
def name(self) -> str:
13941451
"""

gpustack_runtime/deployer/docker.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,15 +1033,20 @@ def _create_containers(
10331033
# Set to "all" if no specific devices detected,
10341034
# maybe the container backend can handle it.
10351035
create_options["environment"][re] = (
1036-
r_v
1036+
str(r_v)
10371037
if not privileged
10381038
else (",".join(vd_values.get(re, [])) or "all")
10391039
)
10401040

10411041
# Configure runtime device access environment variables.
10421042
if r_v != "all" and privileged:
10431043
for be in backend_env:
1044-
create_options["environment"][be] = r_v
1044+
create_options["environment"][be] = (
1045+
self.align_backend_visible_devices_env_values(
1046+
be,
1047+
str(r_v),
1048+
)
1049+
)
10451050

10461051
# Parameterize mounts.
10471052
self._append_container_mounts(

gpustack_runtime/deployer/kuberentes.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,7 @@ def _create_pod(
890890
kubernetes.client.V1EnvVar(
891891
name=re,
892892
value=(
893-
r_v
893+
str(r_v)
894894
if not privileged
895895
else (
896896
",".join(vd_values.get(re, [])) or "all"
@@ -905,7 +905,10 @@ def _create_pod(
905905
container.env.append(
906906
kubernetes.client.V1EnvVar(
907907
name=be,
908-
value=str(r_v),
908+
value=self.align_backend_visible_devices_env_values(
909+
be,
910+
str(r_v),
911+
),
909912
),
910913
)
911914

gpustack_runtime/envs.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,17 @@
117117
e.g., `{"nvidia.com/devices": ["CUDA_VISIBLE_DEVICES"], "amd.com/devices": ["ROCR_VISIBLE_DEVICES"]}`.
118118
The key is the resource key, and the value is a list of environment variable names.
119119
"""
120-
GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_MODE: str | None = None
120+
GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID: set[str] | None = None
121121
"""
122-
Mode for valuing runtime visible devices environment variables (options: Index or UUID).
122+
Use UUIDs for the given runtime visible devices environment variables.
123+
"""
124+
GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT: set[str] | None = (
125+
None
126+
)
127+
"""
128+
Enable value alignment for the given backend visible devices environment variables.
129+
When detected devices are considered to be partially mapped (starting from a non-zero value or not contiguous),
130+
alignment is performed to ensure they are correctly identified.
123131
"""
124132

125133
# Detector
@@ -277,13 +285,18 @@
277285
),
278286
list_sep=",",
279287
),
280-
"GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_MODE": lambda: choice(
288+
"GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID": lambda: to_set(
289+
getenv(
290+
"GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
291+
),
292+
sep=",",
293+
),
294+
"GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT": lambda: to_set(
281295
getenv(
282-
"GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_MODE",
283-
"Index",
296+
"GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT",
297+
"ASCEND_RT_VISIBLE_DEVICES,NPU_VISIBLE_DEVICES",
284298
),
285-
options=["Index", "UUID"],
286-
default="Index",
299+
sep=",",
287300
),
288301
# Detector
289302
"GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY": lambda: to_bool(

0 commit comments

Comments
 (0)