Skip to content
This repository was archived by the owner on Sep 6, 2025. It is now read-only.

Commit 54cf516

Browse files
authored
perf: reduce container idle timeout for neuralchat & psyfighter1 (#84)
1 parent af26dd5 commit 54cf516

File tree

1 file changed: 4 additions and 1 deletion (+4 -1 lines changed)

1 file changed: 4 additions and 1 deletion (+4 -1 lines changed)

modal/runner/containers/vllm_unified.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ def _make_container(
2424
gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
2525
concurrent_inputs: int = 8,
2626
max_containers: int = None,
27+
container_idle_timeout: int = 20 * 60, # 20 minutes
2728
keep_warm: int = None,
2829
**vllm_opts,
2930
):
@@ -88,7 +89,7 @@ def __init__(self):
8889
memory=1024,
8990
gpu=gpu,
9091
allow_concurrent_inputs=concurrent_inputs,
91-
container_idle_timeout=20 * 60,
92+
container_idle_timeout=container_idle_timeout,
9293
timeout=10 * 60,
9394
secrets=[*get_observability_secrets()],
9495
concurrency_limit=max_containers,
@@ -120,6 +121,7 @@ def __init__(self):
120121
gpu=modal.gpu.A10G(count=1),
121122
concurrent_inputs=4,
122123
max_containers=5,
124+
container_idle_timeout=2 * 60,
123125
quantization="GPTQ",
124126
)
125127

@@ -130,6 +132,7 @@ def __init__(self):
130132
gpu=modal.gpu.A10G(count=1),
131133
concurrent_inputs=4,
132134
max_containers=5,
135+
container_idle_timeout=2 * 60,
133136
quantization="GPTQ",
134137
)
135138

0 commit comments

Comments (0)