
Commit 5202823

perf: let noromaid mixtral scale to zero (#87)

* perf: let noromaid mixtral scale to zero
* fix: change max container to 1 too

Parent: e0c3077

1 file changed: +1 −2 lines

modal/runner/containers/vllm_unified.py

Lines changed: 1 addition & 2 deletions
@@ -152,8 +152,7 @@ def __init__(self):
             model_name=_noromaid,
             gpu=modal.gpu.A100(count=1, memory=40),
             concurrent_inputs=4,
-            max_containers=3,
-            keep_warm=1,
+            max_containers=1,
             quantization="GPTQ",
             dtype="float16",  # vLLM errors when using dtype="auto" with this model
         )
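For context, a minimal sketch of why this change lets the deployment scale to zero: on Modal, a function that reserves no warm containers (keep_warm unset or 0) can be spun all the way down when idle, while max_containers=1 caps scale-out at a single GPU container. The decorator parameters below (allow_concurrent_inputs, concurrency_limit, keep_warm) are an assumption about how vllm_unified.py forwards its kwargs to Modal's function API of that era; only the values come from this diff, and the app and function names are hypothetical.

    import modal

    stub = modal.Stub("noromaid-mixtral")  # hypothetical app name

    @stub.function(
        gpu=modal.gpu.A100(count=1, memory=40),  # same GPU spec as the diff
        allow_concurrent_inputs=4,               # assumed mapping of concurrent_inputs=4
        concurrency_limit=1,                     # assumed mapping of max_containers=1
        # keep_warm intentionally omitted: with no warm pool reserved, Modal may
        # scale this function down to zero containers when there is no traffic.
    )
    def generate(prompt: str) -> str:
        # placeholder body; the real container serves the model via vLLM
        # (GPTQ quantization, float16 dtype, per the diff above)
        ...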
