Skip to content

Commit f81fe77

Browse files
committed
refactor: exec command
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent ea6b489 commit f81fe77

File tree

5 files changed

+90
-84
lines changed

5 files changed

+90
-84
lines changed

gpustack_runtime/cmds/deployer.py

Lines changed: 35 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,39 @@
3737
if TYPE_CHECKING:
3838
from argparse import Namespace, _SubParsersAction
3939

40+
_IGNORE_ENVS = (
41+
"PATH",
42+
"HOME",
43+
"LANG",
44+
"PWD",
45+
"SHELL",
46+
"LOG",
47+
"XDG",
48+
"XPC",
49+
"SSH",
50+
"LC",
51+
"LS",
52+
"_",
53+
"USER",
54+
"TERM",
55+
"LESS",
56+
"SHLVL",
57+
"DBUS",
58+
"OLDPWD",
59+
"MOTD",
60+
"LD",
61+
"LIB",
62+
"PS1",
63+
"PY",
64+
"VIRTUAL_ENV",
65+
"CONDA",
66+
"PAGE",
67+
"ZSH",
68+
"COMMAND_MODE",
69+
"TMPDIR",
70+
"GPUSTACK_",
71+
)
72+
4073

4174
class CreateRunnerWorkloadSubCommand(SubCommand):
4275
"""
@@ -175,32 +208,7 @@ def run(self):
175208
value=value,
176209
)
177210
for name, value in os.environ.items()
178-
if not name.startswith(
179-
(
180-
"PATH",
181-
"HOME",
182-
"LANG",
183-
"PWD",
184-
"SHELL",
185-
"LOG",
186-
"XDG",
187-
"XPC",
188-
"SSH",
189-
"LC",
190-
"LS",
191-
"_",
192-
"USER",
193-
"TERM",
194-
"LESS",
195-
"SHLVL",
196-
"DBUS",
197-
"OLDPWD",
198-
"MOTD",
199-
"LD",
200-
"LIB",
201-
"GPUSTACK_",
202-
),
203-
)
211+
if not name.startswith(_IGNORE_ENVS)
204212
]
205213
if self.backend:
206214
resources = ContainerResources(
@@ -377,32 +385,7 @@ def run(self):
377385
value=value,
378386
)
379387
for name, value in os.environ.items()
380-
if not name.startswith(
381-
(
382-
"PATH",
383-
"HOME",
384-
"LANG",
385-
"PWD",
386-
"SHELL",
387-
"LOG",
388-
"XDG",
389-
"XPC",
390-
"SSH",
391-
"LC",
392-
"LS",
393-
"_",
394-
"USER",
395-
"TERM",
396-
"LESS",
397-
"SHLVL",
398-
"DBUS",
399-
"OLDPWD",
400-
"MOTD",
401-
"LD",
402-
"LIB",
403-
"GPUSTACK_",
404-
),
405-
)
388+
if not name.startswith(_IGNORE_ENVS)
406389
]
407390
if self.backend:
408391
resources = ContainerResources(

gpustack_runtime/deployer/__types__.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,7 @@ class WorkloadPlan(WorkloadSecurity):
736736
"""
737737

738738
resource_key_runtime_env_mapping: dict[str, str] = field(
739-
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_MAP_RUNTIME_VISIBLE_DEVICES,
739+
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES,
740740
)
741741
"""
742742
Mapping from resource names to environment variable names for device allocation,
@@ -746,7 +746,7 @@ class WorkloadPlan(WorkloadSecurity):
746746
With privileged mode, the container can access all GPUs even if specified.
747747
"""
748748
resource_key_backend_env_mapping: dict[str, list[str]] = field(
749-
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_MAP_BACKEND_VISIBLE_DEVICES,
749+
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES,
750750
)
751751
"""
752752
Mapping from resource names to environment variable names for device runtime,
@@ -918,8 +918,10 @@ class WorkloadStatus:
918918
Attributes:
919919
name (WorkloadName):
920920
Name for the workload, it should be unique in the deployer.
921-
created_at (str | None):
921+
created_at (str):
922922
Creation time of the workload.
923+
namespace (WorkloadNamespace | None):
924+
Namespace for the workload.
923925
labels (dict[str, str] | None):
924926
Labels for the workload.
925927
executable (list[WorkloadStatusOperation]):
@@ -940,6 +942,10 @@ class WorkloadStatus:
940942
"""
941943
Creation time of the workload.
942944
"""
945+
namespace: WorkloadNamespace | None = None
946+
"""
947+
Namespace for the workload.
948+
"""
943949
labels: dict[str, str] | None = field(default_factory=dict)
944950
"""
945951
Labels for the workload.
@@ -978,11 +984,11 @@ def fileno(self) -> int:
978984
raise NotImplementedError
979985

980986
@abstractmethod
981-
def read(self, size: int = -1) -> bytes | str | None:
987+
def read(self, size: int = -1) -> bytes | None:
982988
raise NotImplementedError
983989

984990
@abstractmethod
985-
def write(self, data: bytes | str) -> int:
991+
def write(self, data: bytes) -> int:
986992
raise NotImplementedError
987993

988994
@abstractmethod
@@ -1016,12 +1022,16 @@ def __init__(self):
10161022

10171023
if backend := detect_backend():
10181024
rk = envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.get(backend)
1019-
re = envs.GPUSTACK_RUNTIME_DEPLOY_MAP_RUNTIME_VISIBLE_DEVICES.get(rk)
1020-
be = envs.GPUSTACK_RUNTIME_DEPLOY_MAP_BACKEND_VISIBLE_DEVICES.get(rk)
1021-
if re:
1022-
self._runtime_visible_devices_env_name = re
1023-
if be:
1024-
self._backend_visible_devices_env_names = be
1025+
ren = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES.get(
1026+
rk,
1027+
)
1028+
ben = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES.get(
1029+
rk,
1030+
)
1031+
if ren:
1032+
self._runtime_visible_devices_env_name = ren
1033+
if ben:
1034+
self._backend_visible_devices_env_names = ben
10251035

10261036
@staticmethod
10271037
@abstractmethod

gpustack_runtime/deployer/docker.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1377,20 +1377,16 @@ def closed(self) -> bool:
13771377
return not (self._sock and not self._sock.closed)
13781378

13791379
def fileno(self) -> int:
1380-
if self.closed:
1381-
return -1
13821380
return self._sock.fileno()
13831381

1384-
def read(self, size=-1) -> bytes | str | None:
1382+
def read(self, size=-1) -> bytes | None:
13851383
if self.closed:
13861384
return None
13871385
return self._sock.read(size)
13881386

1389-
def write(self, data: bytes | str) -> int:
1387+
def write(self, data: bytes) -> int:
13901388
if self.closed:
13911389
return 0
1392-
if isinstance(data, str):
1393-
data = data.encode("utf-8")
13941390
return self._sock.write(data)
13951391

13961392
def close(self):

gpustack_runtime/deployer/kuberentes.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ def __init__(
188188
**kwargs,
189189
):
190190
created_at = k_pod.metadata.creation_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
191+
namespace = (
192+
k_pod.metadata.namespace
193+
if k_pod.metadata.namespace != envs.GPUSTACK_RUNTIME_KUBERNETES_NAMESPACE
194+
else None
195+
)
191196
labels = {
192197
k: v
193198
for k, v in (k_pod.metadata.labels or {}).items()
@@ -197,6 +202,7 @@ def __init__(
197202
super().__init__(
198203
name=name,
199204
created_at=created_at,
205+
namespace=namespace,
200206
labels=labels,
201207
**kwargs,
202208
)
@@ -1559,25 +1565,34 @@ class KubernetesWorkloadExecStream(WorkloadExecStream):
15591565
def __init__(self, ws: kubernetes.stream.ws_client.WSClient):
15601566
super().__init__()
15611567
self._ws = ws
1568+
self._ws.run_forever(timeout=1)
1569+
# self._ws.write_stdin(self._ws.newline)
15621570

15631571
@property
15641572
def closed(self) -> bool:
15651573
return not (self._ws and self._ws.is_open())
15661574

15671575
def fileno(self) -> int:
1568-
if self.closed:
1569-
return -1
15701576
return self._ws.sock.fileno()
15711577

1572-
def read(self, *_) -> bytes | str | None:
1578+
def recv(self, size=-1) -> bytes | None:
1579+
return self.read(size)
1580+
1581+
def send(self, data: bytes) -> int:
1582+
return self.write(data)
1583+
1584+
def read(self, *_) -> bytes | None:
15731585
if self.closed:
15741586
return None
1575-
raise NotImplementedError
1587+
self._ws.update(timeout=1)
1588+
return self._ws.read_all().encode("utf-8", errors="replace")
15761589

1577-
def write(self, *_) -> int:
1590+
def write(self, data: bytes) -> int:
15781591
if self.closed:
15791592
return 0
1580-
raise NotImplementedError
1593+
data_len = len(data)
1594+
self._ws.write_stdin(data.decode("utf-8", errors="replace"))
1595+
return data_len
15811596

15821597
def close(self):
15831598
if not self.closed:

gpustack_runtime/envs.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,18 @@
4040
which is used to tell the deployer to perform device detection and get the corresponding resource key before mapping.
4141
e.g., "gpustack.ai/devices".
4242
"""
43-
GPUSTACK_RUNTIME_DEPLOY_MAP_RUNTIME_VISIBLE_DEVICES: dict[str, str] | None = None
43+
GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES: (
44+
dict[str, str] | None
45+
) = None
4446
"""
45-
Manual mapping of container backend visible devices environment variables,
47+
Manual mapping of runtime visible devices environment variables,
4648
which is used to tell the Container Runtime which GPUs to mount into the container,
4749
e.g., `{"nvidia.com/devices": "NVIDIA_VISIBLE_DEVICES", "amd.com/devices": "AMD_VISIBLE_DEVICES"}`.
4850
The key is the resource key, and the value is the environment variable name.
4951
"""
50-
GPUSTACK_RUNTIME_DEPLOY_MAP_BACKEND_VISIBLE_DEVICES: dict[str, list[str]] | None = (
51-
None
52-
)
52+
GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES: (
53+
dict[str, list[str]] | None
54+
) = None
5355
"""
5456
Manual mapping of backend visible devices environment variables,
5557
which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container,
@@ -137,9 +139,9 @@
137139
"GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY",
138140
"gpustack.ai/devices",
139141
),
140-
"GPUSTACK_RUNTIME_DEPLOY_MAP_RUNTIME_VISIBLE_DEVICES": lambda: to_dict(
142+
"GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES": lambda: to_dict(
141143
getenv(
142-
"GPUSTACK_RUNTIME_DEPLOY_MAP_RUNTIME_VISIBLE_DEVICES",
144+
"GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES",
143145
"nvidia.com/devices=NVIDIA_VISIBLE_DEVICES;"
144146
"amd.com/devices=AMD_VISIBLE_DEVICES;"
145147
"huawei.com/devices=ASCEND_VISIBLE_DEVICES;"
@@ -149,9 +151,9 @@
149151
"cambricon.com/devices=CAMBRICON_VISIBLE_DEVICES;",
150152
),
151153
),
152-
"GPUSTACK_RUNTIME_DEPLOY_MAP_BACKEND_VISIBLE_DEVICES": lambda: to_dict(
154+
"GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES": lambda: to_dict(
153155
getenv(
154-
"GPUSTACK_RUNTIME_DEPLOY_MAP_BACKEND_VISIBLE_DEVICES",
156+
"GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES",
155157
"nvidia.com/devices=CUDA_VISIBLE_DEVICES;"
156158
"amd.com/devices=ROCR_VISIBLE_DEVICES;"
157159
"huawei.com/devices=ASCEND_RT_VISIBLE_DEVICES,NPU_VISIBLE_DEVICES;"

0 commit comments

Comments
 (0)