Skip to content

Commit c7748c1

Browse files
committed
refactor: docker container status parsing
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 8a577d1 commit c7748c1

File tree

2 files changed

+64
-31
lines changed

2 files changed

+64
-31
lines changed

gpustack_runtime/cmds/deployer.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def register(parser: _SubParsersAction):
123123
deploy_parser.add_argument(
124124
"--check",
125125
action="store_true",
126-
help="enable health check (default: False)",
126+
help="enable health check, needs --port (default: False)",
127127
default=False,
128128
)
129129

@@ -234,7 +234,11 @@ def run(self):
234234
host_network=self.host_network,
235235
containers=[
236236
Container(
237-
restart_policy=ContainerRestartPolicyEnum.NEVER,
237+
restart_policy=(
238+
ContainerRestartPolicyEnum.NEVER
239+
if not self.check
240+
else ContainerRestartPolicyEnum.ALWAYS
241+
),
238242
image=f"gpustack/runner:{self.backend if self.backend else 'Host'}X.Y-{self.service}{self.version}",
239243
name="default",
240244
envs=env,
@@ -332,7 +336,7 @@ def register(parser: _SubParsersAction):
332336
deploy_parser.add_argument(
333337
"--check",
334338
action="store_true",
335-
help="enable health check (default: False)",
339+
help="enable health check, needs --port (default: False)",
336340
default=False,
337341
)
338342

@@ -442,7 +446,11 @@ def run(self):
442446
host_network=self.host_network,
443447
containers=[
444448
Container(
445-
restart_policy=ContainerRestartPolicyEnum.NEVER,
449+
restart_policy=(
450+
ContainerRestartPolicyEnum.NEVER
451+
if not self.check
452+
else ContainerRestartPolicyEnum.ALWAYS
453+
),
446454
image=self.image,
447455
name="default",
448456
envs=env,

gpustack_runtime/deployer/docker.py

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -165,35 +165,55 @@ def parse_state(
165165
if not d_run_containers:
166166
if not d_init_containers:
167167
return WorkloadStatusStateEnum.UNKNOWN
168-
return WorkloadStatusStateEnum.PENDING
168+
return WorkloadStatusStateEnum.INACTIVE
169169

170+
d_run_state = WorkloadStatusStateEnum.RUNNING
170171
for cr in d_run_containers:
171-
if cr.status == "created":
172-
if not d_init_containers:
173-
return WorkloadStatusStateEnum.PENDING
174-
for ci in d_init_containers or []:
175-
if ci.status == "created":
176-
return WorkloadStatusStateEnum.PENDING
177-
if ci.status == "dead" or (
178-
ci.status == "exited" and ci.attrs["State"]["ExitCode"] != 0
179-
):
180-
return WorkloadStatusStateEnum.FAILED
181-
if ci.status != "exited" and not _has_restart_policy(ci):
182-
return WorkloadStatusStateEnum.INITIALIZING
183-
return WorkloadStatusStateEnum.INITIALIZING
184-
if cr.status == "dead" or (
185-
cr.status == "exited" and cr.attrs["State"]["ExitCode"] != 0
186-
):
187-
if not _has_restart_policy(cr):
188-
return WorkloadStatusStateEnum.FAILED
172+
if cr.status == "dead":
173+
return WorkloadStatusStateEnum.FAILED
174+
if cr.status == "exited":
175+
if cr.attrs["State"].get("ExitCode", 1) != 0:
176+
return (
177+
WorkloadStatusStateEnum.FAILED
178+
if not _has_restart_policy(cr)
179+
else WorkloadStatusStateEnum.UNHEALTHY
180+
)
181+
return WorkloadStatusStateEnum.INACTIVE
182+
if cr.status == "paused":
183+
return WorkloadStatusStateEnum.INACTIVE
184+
if cr.status in ["restarting", "removing"]:
189185
return WorkloadStatusStateEnum.UNHEALTHY
190-
if cr.status != "running" and not _has_restart_policy(cr):
186+
if cr.status == "created":
187+
d_run_state = WorkloadStatusStateEnum.PENDING
188+
else:
189+
health = cr.attrs["State"].get("Health", {})
190+
if health and health.get("Status", "healthy") != "healthy":
191+
return WorkloadStatusStateEnum.UNHEALTHY
192+
193+
d_init_state = None
194+
for ci in d_init_containers or []:
195+
if ci.status == "dead":
196+
return WorkloadStatusStateEnum.FAILED
197+
if ci.status == "exited":
198+
if ci.attrs["State"].get("ExitCode", 1) != 0:
199+
return (
200+
WorkloadStatusStateEnum.FAILED
201+
if not _has_restart_policy(ci)
202+
else WorkloadStatusStateEnum.UNHEALTHY
203+
)
204+
elif ci.status in ["paused", "removing"]:
205+
if _has_restart_policy(ci):
206+
return WorkloadStatusStateEnum.UNHEALTHY
207+
elif ci.status == "restarting":
208+
if _has_restart_policy(ci):
209+
return WorkloadStatusStateEnum.UNHEALTHY
210+
d_init_state = WorkloadStatusStateEnum.INITIALIZING
211+
elif ci.status == "created":
191212
return WorkloadStatusStateEnum.PENDING
192-
health = cr.attrs["State"].get("Health", {})
193-
if health and health.get("Status") != "healthy":
194-
return WorkloadStatusStateEnum.UNHEALTHY
213+
elif not _has_restart_policy(ci):
214+
d_init_state = WorkloadStatusStateEnum.INITIALIZING
195215

196-
return WorkloadStatusStateEnum.RUNNING
216+
return d_init_state if d_init_state else d_run_state
197217

198218
def __init__(
199219
self,
@@ -1024,13 +1044,16 @@ def _create_containers(
10241044
def _start_containers(
10251045
container: docker.models.containers.Container
10261046
| list[docker.models.containers.Container],
1047+
force: bool = True,
10271048
):
10281049
"""
10291050
Start or restart the container(s) based on their current status.
10301051
10311052
Args:
10321053
container:
10331054
A Docker container or a list of Docker containers to start or restart.
1055+
force:
1056+
Whether to restart the container if it's in exited or dead status, or unpause it if it's in paused status; if False, such containers are left as-is.
10341057
10351058
Raises:
10361059
docker.errors.APIError:
@@ -1046,9 +1069,11 @@ def _start_containers(
10461069
case "created":
10471070
container.start()
10481071
case "exited" | "dead":
1049-
container.restart()
1072+
if force:
1073+
container.restart()
10501074
case "paused":
1051-
container.unpause()
1075+
if force:
1076+
container.unpause()
10521077

10531078
def __init__(self):
10541079
super().__init__(_NAME)
@@ -1368,7 +1393,7 @@ def _create(self, workload: WorkloadPlan):
13681393
# Start containers in order: pause -> init(s) -> run(s) -> unhealthy restart
13691394
try:
13701395
self._start_containers(pause_container)
1371-
self._start_containers(init_containers)
1396+
self._start_containers(init_containers, force=False)
13721397
self._start_containers(run_containers)
13731398
if unhealthy_restart_container:
13741399
self._start_containers(unhealthy_restart_container)

0 commit comments

Comments
 (0)