Skip to content

Commit 7c1eb3b

Browse files
committed
Remove Gunicorn and use Uvicorn only for gateway
1 parent 0fbb062 commit 7c1eb3b

File tree

3 files changed

+27
-28
lines changed

model-engine/model_engine_server/api/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
logger = make_logger(logger_name())
3535

36-
# Allows us to make the Uvicorn worker concurrency in model_engine_server/api/worker.py very high
36+
# See also Uvicorn worker concurrency in model_engine_server/api/worker.py
3737
MAX_CONCURRENCY = 500
3838

3939
concurrency_limiter = MultiprocessingConcurrencyLimiter(

model-engine/model_engine_server/api/worker.py

Lines changed: 0 additions & 14 deletions
This file was deleted.

model-engine/model_engine_server/entrypoints/start_fastapi_server.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,42 @@
33
44
You can do this with `start-fastapi-server`.
55
"""
6+
67
import argparse
78
import subprocess
89
from typing import List
910

11+
# Uvicorn returns 503 instead of 429 when concurrency exceeds the limit
12+
# We'll autoscale at target concurrency of a much lower number (around 50), and this just makes sure we don't 503 with bursty traffic
13+
# We set this very high since model_engine_server/api/app.py sets a lower per-pod concurrency at which we start returning 429s
14+
CONCURRENCY_LIMIT = 10000
15+
1016

11-
def start_gunicorn_server(port: int, num_workers: int, debug: bool) -> None:
12-
"""Starts a GUnicorn server locally."""
17+
def start_uvicorn_server(port: int, debug: bool) -> None:
18+
"""Starts a Uvicorn server locally."""
1319
additional_args: List[str] = []
1420
if debug:
15-
additional_args.extend(["--reload", "--timeout", "0"])
21+
additional_args.extend(["--reload", "--timeout-graceful-shutdown", "0"])
1622
command = [
17-
"gunicorn",
18-
"--bind",
19-
f"[::]:{port}",
20-
"--timeout",
23+
"uvicorn",
24+
"--host",
25+
"::",
26+
"--port",
27+
f"{port}",
28+
"--timeout-graceful-shutdown",
2129
"60",
22-
"--keep-alive",
30+
"--timeout-keep-alive",
2331
"2",
24-
"--worker-class",
25-
"model_engine_server.api.worker.LaunchWorker",
32+
# uvloop and httptools are both faster than their alternatives, but they are not compatible
33+
# with Windows or PyPy.
34+
"--loop",
35+
"uvloop",
36+
"--http",
37+
"httptools",
38+
"--limit-concurrency",
39+
f"{CONCURRENCY_LIMIT}",
2640
"--workers",
27-
f"{num_workers}",
41+
"1", # Let the Kubernetes deployment handle the number of pods
2842
*additional_args,
2943
"model_engine_server.api.app:app",
3044
]
@@ -38,11 +52,10 @@ def entrypoint():
3852
# We can probably use asyncio since this service is going to be more I/O bound.
3953
parser = argparse.ArgumentParser(description="Hosted Inference Server")
4054
parser.add_argument("--port", type=int, default=5000)
41-
parser.add_argument("--num-workers", type=int, default=4)
4255
parser.add_argument("--debug", "-d", action="store_true")
4356
args = parser.parse_args()
4457

45-
start_gunicorn_server(args.port, args.num_workers, args.debug)
58+
start_uvicorn_server(args.port, args.debug)
4659

4760

4861
if __name__ == "__main__":

0 commit comments

Comments (0)