33
44You can do this with `start-fastapi-server`.
55"""
6+
67import argparse
78import subprocess
89from typing import List
910
11+ # Uvicorn returns 503 instead of 429 when concurrency exceeds the limit
12+ # We autoscale at a much lower target concurrency (around 50); this limit only ensures we don't return 503s under bursty traffic.
13+ # We set this very high since model_engine_server/api/app.py sets a lower per-pod concurrency at which we start returning 429s
14+ CONCURRENCY_LIMIT = 10000
15+
1016
11- def start_gunicorn_server (port : int , num_workers : int , debug : bool ) -> None :
12- """Starts a GUnicorn server locally."""
17+ def start_uvicorn_server (port : int , debug : bool ) -> None :
18+ """Starts a Uvicorn server locally."""
1319 additional_args : List [str ] = []
1420 if debug :
15- additional_args .extend (["--reload" , "--timeout" , "0" ])
21+ additional_args .extend (["--reload" , "--timeout-graceful-shutdown" , "0" ])
1622 command = [
17- "gunicorn" ,
18- "--bind" ,
19- f"[::]:{ port } " ,
20- "--timeout" ,
23+ "uvicorn" ,
24+ "--host" ,
25+ "::" ,
26+ "--port" ,
27+ f"{ port } " ,
28+ "--timeout-graceful-shutdown" ,
2129 "60" ,
22- "--keep-alive" ,
30+ "--timeout-keep-alive" ,
2331 "2" ,
24- "--worker-class" ,
25- "model_engine_server.api.worker.LaunchWorker" ,
32+ # uvloop and httptools are both faster than their alternatives, but they are not compatible
33+ # with Windows or PyPy.
34+ "--loop" ,
35+ "uvloop" ,
36+ "--http" ,
37+ "httptools" ,
38+ "--limit-concurrency" ,
39+ f"{ CONCURRENCY_LIMIT } " ,
2640 "--workers" ,
27- f" { num_workers } " ,
41+ "1" , # Let the Kubernetes deployment handle the number of pods
2842 * additional_args ,
2943 "model_engine_server.api.app:app" ,
3044 ]
@@ -38,11 +52,10 @@ def entrypoint():
3852 # We can probably use asyncio since this service is going to be more I/O bound.
3953 parser = argparse .ArgumentParser (description = "Hosted Inference Server" )
4054 parser .add_argument ("--port" , type = int , default = 5000 )
41- parser .add_argument ("--num-workers" , type = int , default = 4 )
4255 parser .add_argument ("--debug" , "-d" , action = "store_true" )
4356 args = parser .parse_args ()
4457
45- start_gunicorn_server (args .port , args . num_workers , args .debug )
58+ start_uvicorn_server (args .port , args .debug )
4659
4760
4861if __name__ == "__main__" :
0 commit comments