|
11 | 11 | # and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/). |
12 | 12 |
|
13 | 13 | # In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal. |
14 | | -# You can find a video walkthrough of this example on our YouTube channel [here](https://www.youtube.com/watch?v=QmY_7ePR1hM). |
15 | | - |
16 | | -# Note that the vLLM server is a FastAPI app, which can be configured and extended just like any other. |
17 | | -# Here, we use it to add simple authentication middleware, following the |
18 | | -# [implementation in the vLLM repository](https://github.com/vllm-project/vllm/blob/v0.5.3post1/vllm/entrypoints/openai/api_server.py). |
19 | 14 |
|
20 | 15 | # Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs |
21 | 16 | # [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible). |
|
31 | 26 |
|
32 | 27 | import modal |
33 | 28 |
|
34 | | -vllm_image = modal.Image.debian_slim(python_version="3.12").pip_install( |
35 | | - "vllm==0.6.3post1", "fastapi[standard]==0.115.4" |
| 29 | +vllm_image = ( |
| 30 | + modal.Image.debian_slim(python_version="3.12") |
| 31 | + .pip_install( |
| 32 | + "vllm==0.7.2", |
| 33 | + "huggingface_hub[hf_transfer]==0.26.2", |
| 34 | +        "flashinfer-python==0.2.0.post2",  # pinned exactly: flashinfer releases are unstable |
| 35 | + extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5", |
| 36 | + ) |
| 37 | + .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "VLLM_USE_V1": "1"}) |
36 | 38 | ) |
37 | 39 |
|
38 | 40 | # ## Download the model weights |
|
48 | 50 | MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16" |
49 | 51 | MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d" |
50 | 52 |
|
51 | | -# We need to make the weights of that model available to our Modal Functions. |
52 | | - |
53 | | -# So to follow along with this example, you'll need to download those weights |
54 | | -# onto a Modal Volume by running another script from the |
55 | | -# [examples repository](https://github.com/modal-labs/modal-examples). |
56 | | - |
57 | | -try: |
58 | | - volume = modal.Volume.from_name("llamas", create_if_missing=False).hydrate() |
59 | | -except modal.exception.NotFoundError: |
60 | | - raise Exception("Download models first with modal run download_llama.py") |
| 53 | +# Although vLLM will download weights on-demand, we want to cache them if possible. We'll use [Modal Volumes](https://modal.com/docs/guide/volumes) |
| 54 | +# as a cache, which act as a "shared disk" that all Functions can access. |
| 55 | +hf_cache_vol = modal.Volume.from_name( |
| 56 | + "huggingface-cache", create_if_missing=True |
| 57 | +) |
| 58 | +vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True) |
61 | 59 |
|
62 | 60 |
|
63 | 61 | # ## Build a vLLM engine and serve it |
64 | 62 |
|
65 | | -# vLLM's OpenAI-compatible server is exposed as a [FastAPI](https://fastapi.tiangolo.com/) router. |
66 | | - |
67 | | -# FastAPI is a Python web framework that implements the [ASGI standard](https://en.wikipedia.org/wiki/Asynchronous_Server_Gateway_Interface), |
68 | | -# much like [Flask](https://en.wikipedia.org/wiki/Flask_(web_framework)) is a Python web framework |
69 | | -# that implements the [WSGI standard](https://en.wikipedia.org/wiki/Web_Server_Gateway_Interface). |
70 | | - |
71 | | -# Modal offers [first-class support for ASGI (and WSGI) apps](https://modal.com/docs/guide/webhooks). We just need to decorate a function that returns the app |
72 | | -# with `@modal.asgi_app()` (or `@modal.wsgi_app()`) and then add it to the Modal app with the `app.function` decorator. |
73 | | - |
74 | | -# The function below first imports the FastAPI router from the vLLM library, then adds authentication compatible with OpenAI client libraries. You might also add more routes here. |
75 | | - |
76 | | -# Then, the function creates an `AsyncLLMEngine`, the core of the vLLM server. It's responsible for loading the model, running inference, and serving responses. |
| 63 | +# We start a vLLM server as a subprocess, which Modal has [first-class support for](https://modal.com/docs/reference/modal.web_server). We configure Modal |
| 64 | +# to forward requests to port 8000. |
77 | 65 |
|
78 | | -# After attaching that engine to the FastAPI app via the `api_server` module of the vLLM library, we return the FastAPI app |
79 | | -# so it can be served on Modal. |
| 66 | +# The function below spawns a vLLM instance listening at port 8000, serving requests to our model. vLLM will authenticate requests |
| 67 | +# using the API key we provide it. |
80 | 68 |
|
81 | 69 | app = modal.App("example-vllm-openai-compatible") |
82 | 70 |
|
83 | 71 | N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count |
84 | | -TOKEN = "super-secret-token" # auth token. for production use, replace with a modal.Secret |
| 72 | +API_KEY = "super-secret-key"  # API key used for auth; for production, replace with a modal.Secret |
85 | 73 |
|
86 | 74 | MINUTES = 60 # seconds |
87 | 75 | HOURS = 60 * MINUTES |
88 | 76 |
|
| 77 | +VLLM_PORT = 8000 |
| 78 | + |
89 | 79 |
|
90 | 80 | @app.function( |
91 | 81 | image=vllm_image, |
92 | 82 | gpu=f"H100:{N_GPU}", |
93 | 83 | container_idle_timeout=5 * MINUTES, |
94 | 84 | timeout=24 * HOURS, |
95 | 85 | allow_concurrent_inputs=1000, |
96 | | - volumes={MODELS_DIR: volume}, |
| 86 | + volumes={ |
| 87 | + "/root/.cache/huggingface": hf_cache_vol, |
| 88 | + "/root/.cache/vllm": vllm_cache_vol, |
| 89 | + }, |
97 | 90 | ) |
98 | | -@modal.asgi_app() |
| 91 | +@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES) |
99 | 92 | def serve(): |
100 | | - import fastapi |
101 | | - import vllm.entrypoints.openai.api_server as api_server |
102 | | - from vllm.engine.arg_utils import AsyncEngineArgs |
103 | | - from vllm.engine.async_llm_engine import AsyncLLMEngine |
104 | | - from vllm.entrypoints.logger import RequestLogger |
105 | | - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat |
106 | | - from vllm.entrypoints.openai.serving_completion import ( |
107 | | - OpenAIServingCompletion, |
108 | | - ) |
109 | | - from vllm.entrypoints.openai.serving_engine import BaseModelPath |
110 | | - from vllm.usage.usage_lib import UsageContext |
111 | | - |
112 | | - volume.reload() # ensure we have the latest version of the weights |
113 | | - |
114 | | - # create a fastAPI app that uses vLLM's OpenAI-compatible router |
115 | | - web_app = fastapi.FastAPI( |
116 | | - title=f"OpenAI-compatible {MODEL_NAME} server", |
117 | | - description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀", |
118 | | - version="0.0.1", |
119 | | - docs_url="/docs", |
120 | | - ) |
121 | | - |
122 | | - # security: CORS middleware for external requests |
123 | | - http_bearer = fastapi.security.HTTPBearer( |
124 | | - scheme_name="Bearer Token", |
125 | | - description="See code for authentication details.", |
126 | | - ) |
127 | | - web_app.add_middleware( |
128 | | - fastapi.middleware.cors.CORSMiddleware, |
129 | | - allow_origins=["*"], |
130 | | - allow_credentials=True, |
131 | | - allow_methods=["*"], |
132 | | - allow_headers=["*"], |
133 | | - ) |
134 | | - |
135 | | - # security: inject dependency on authed routes |
136 | | - async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): |
137 | | - if api_key.credentials != TOKEN: |
138 | | - raise fastapi.HTTPException( |
139 | | - status_code=fastapi.status.HTTP_401_UNAUTHORIZED, |
140 | | - detail="Invalid authentication credentials", |
141 | | - ) |
142 | | - return {"username": "authenticated_user"} |
143 | | - |
144 | | - router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)]) |
145 | | - |
146 | | - # wrap vllm's router in auth router |
147 | | - router.include_router(api_server.router) |
148 | | - # add authed vllm to our fastAPI app |
149 | | - web_app.include_router(router) |
150 | | - |
151 | | - engine_args = AsyncEngineArgs( |
152 | | - model=MODELS_DIR + "/" + MODEL_NAME, |
153 | | - tensor_parallel_size=N_GPU, |
154 | | - gpu_memory_utilization=0.90, |
155 | | - max_model_len=8096, |
156 | | - enforce_eager=False, # capture the graph for faster inference, but slower cold starts (30s > 20s) |
157 | | - ) |
158 | | - |
159 | | - engine = AsyncLLMEngine.from_engine_args( |
160 | | - engine_args, usage_context=UsageContext.OPENAI_API_SERVER |
161 | | - ) |
162 | | - |
163 | | - model_config = get_model_config(engine) |
164 | | - |
165 | | - request_logger = RequestLogger(max_log_len=2048) |
166 | | - |
167 | | - base_model_paths = [ |
168 | | - BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME) |
| 93 | + import subprocess |
| 94 | + |
| 95 | + cmd = [ |
| 96 | + "vllm", |
| 97 | + "serve", |
| 98 | + "--uvicorn-log-level=info", |
| 99 | + "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", |
| 100 | + "--host", |
| 101 | + "0.0.0.0", |
| 102 | + "--port", |
| 103 | + str(VLLM_PORT), |
| 104 | + "--api-key", |
| 105 | + API_KEY, |
169 | 106 | ] |
170 | 107 |
|
171 | | - api_server.chat = lambda s: OpenAIServingChat( |
172 | | - engine, |
173 | | - model_config=model_config, |
174 | | - base_model_paths=base_model_paths, |
175 | | - chat_template=None, |
176 | | - response_role="assistant", |
177 | | - lora_modules=[], |
178 | | - prompt_adapters=[], |
179 | | - request_logger=request_logger, |
180 | | - ) |
181 | | - api_server.completion = lambda s: OpenAIServingCompletion( |
182 | | - engine, |
183 | | - model_config=model_config, |
184 | | - base_model_paths=base_model_paths, |
185 | | - lora_modules=[], |
186 | | - prompt_adapters=[], |
187 | | - request_logger=request_logger, |
188 | | - ) |
189 | | - |
190 | | - return web_app |
| 108 | + subprocess.Popen(" ".join(cmd), shell=True) |
191 | 109 |
|
192 | 110 |
|
193 | 111 | # ## Deploy the server |
@@ -229,26 +147,3 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): |
229 | 147 | # ```bash |
230 | 148 | # modal run openai_compatible/load_test.py |
231 | 149 | # ``` |
232 | | - |
233 | | -# ## Addenda |
234 | | - |
235 | | -# The rest of the code in this example is utility code. |
236 | | - |
237 | | - |
238 | | -def get_model_config(engine): |
239 | | - import asyncio |
240 | | - |
241 | | - try: # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1 |
242 | | - event_loop = asyncio.get_running_loop() |
243 | | - except RuntimeError: |
244 | | - event_loop = None |
245 | | - |
246 | | - if event_loop is not None and event_loop.is_running(): |
247 | | - # If the current is instanced by Ray Serve, |
248 | | - # there is already a running event loop |
249 | | - model_config = event_loop.run_until_complete(engine.get_model_config()) |
250 | | - else: |
251 | | - # When using single vLLM without engine_use_ray |
252 | | - model_config = asyncio.run(engine.get_model_config()) |
253 | | - |
254 | | - return model_config |
0 commit comments