This repository was archived by the owner on Sep 6, 2025. It is now read-only.

Commit 9c099b3

perf: let bagel scale to zero (#85)

1 parent 54cf516 commit 9c099b3

1 file changed: +0 −1 lines changed

modal/runner/containers/vllm_unified.py

Lines changed: 0 additions & 1 deletion
@@ -165,7 +165,6 @@ def __init__(self):
             gpu=modal.gpu.A100(count=1, memory=40),
             concurrent_inputs=4,
             max_containers=1,
-            keep_warm=1,
             max_model_len=8_000,  # Reduced from original 200k
             quantization="GPTQ",
             dtype="float16",  # vLLM errors when using dtype="auto" with this model
