Skip to content

Commit dc662fd

Browse files
authored
Simplify and revise vLLM example (#1076)
1 parent 1c72ae8 commit dc662fd

File tree

1 file changed

+41
-146
lines changed

1 file changed

+41
-146
lines changed

06_gpu_and_ml/llm-serving/vllm_inference.py

Lines changed: 41 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
# and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).
1212

1313
# In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.
14-
# You can find a video walkthrough of this example on our YouTube channel [here](https://www.youtube.com/watch?v=QmY_7ePR1hM).
15-
16-
# Note that the vLLM server is a FastAPI app, which can be configured and extended just like any other.
17-
# Here, we use it to add simple authentication middleware, following the
18-
# [implementation in the vLLM repository](https://github.com/vllm-project/vllm/blob/v0.5.3post1/vllm/entrypoints/openai/api_server.py).
1914

2015
# Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs
2116
# [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).
@@ -31,8 +26,15 @@
3126

3227
import modal
3328

34-
vllm_image = modal.Image.debian_slim(python_version="3.12").pip_install(
35-
"vllm==0.6.3post1", "fastapi[standard]==0.115.4"
29+
vllm_image = (
30+
modal.Image.debian_slim(python_version="3.12")
31+
.pip_install(
32+
"vllm==0.7.2",
33+
"huggingface_hub[hf_transfer]==0.26.2",
34+
"flashinfer-python==0.2.0.post2", # pinning, very unstable
35+
extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
36+
)
37+
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "VLLM_USE_V1": "1"})
3638
)
3739

3840
# ## Download the model weights
@@ -48,146 +50,62 @@
4850
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
4951
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
5052

51-
# We need to make the weights of that model available to our Modal Functions.
52-
53-
# So to follow along with this example, you'll need to download those weights
54-
# onto a Modal Volume by running another script from the
55-
# [examples repository](https://github.com/modal-labs/modal-examples).
56-
57-
try:
58-
volume = modal.Volume.from_name("llamas", create_if_missing=False).hydrate()
59-
except modal.exception.NotFoundError:
60-
raise Exception("Download models first with modal run download_llama.py")
53+
# Although vLLM will download weights on demand, we want to cache them if possible. We'll use [Modal Volumes](https://modal.com/docs/guide/volumes)
54+
# as a cache; they act as a "shared disk" that all Functions can access.
55+
hf_cache_vol = modal.Volume.from_name(
56+
"huggingface-cache", create_if_missing=True
57+
)
58+
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
6159

6260

6361
# ## Build a vLLM engine and serve it
6462

65-
# vLLM's OpenAI-compatible server is exposed as a [FastAPI](https://fastapi.tiangolo.com/) router.
66-
67-
# FastAPI is a Python web framework that implements the [ASGI standard](https://en.wikipedia.org/wiki/Asynchronous_Server_Gateway_Interface),
68-
# much like [Flask](https://en.wikipedia.org/wiki/Flask_(web_framework)) is a Python web framework
69-
# that implements the [WSGI standard](https://en.wikipedia.org/wiki/Web_Server_Gateway_Interface).
70-
71-
# Modal offers [first-class support for ASGI (and WSGI) apps](https://modal.com/docs/guide/webhooks). We just need to decorate a function that returns the app
72-
# with `@modal.asgi_app()` (or `@modal.wsgi_app()`) and then add it to the Modal app with the `app.function` decorator.
73-
74-
# The function below first imports the FastAPI router from the vLLM library, then adds authentication compatible with OpenAI client libraries. You might also add more routes here.
75-
76-
# Then, the function creates an `AsyncLLMEngine`, the core of the vLLM server. It's responsible for loading the model, running inference, and serving responses.
63+
# We start a vLLM server as a subprocess, which Modal has [first-class support for](https://modal.com/docs/reference/modal.web_server). We configure Modal
64+
# to forward requests to port 8000.
7765

78-
# After attaching that engine to the FastAPI app via the `api_server` module of the vLLM library, we return the FastAPI app
79-
# so it can be served on Modal.
66+
# The function below spawns a vLLM instance listening at port 8000, serving requests to our model. vLLM will authenticate requests
67+
# using the API key we provide it.
8068

8169
app = modal.App("example-vllm-openai-compatible")
8270

8371
N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
84-
TOKEN = "super-secret-token" # auth token. for production use, replace with a modal.Secret
72+
API_KEY = "super-secret-key" # api key, for auth. for production use, replace with a modal.Secret
8573

8674
MINUTES = 60 # seconds
8775
HOURS = 60 * MINUTES
8876

77+
VLLM_PORT = 8000
78+
8979

9080
@app.function(
9181
image=vllm_image,
9282
gpu=f"H100:{N_GPU}",
9383
container_idle_timeout=5 * MINUTES,
9484
timeout=24 * HOURS,
9585
allow_concurrent_inputs=1000,
96-
volumes={MODELS_DIR: volume},
86+
volumes={
87+
"/root/.cache/huggingface": hf_cache_vol,
88+
"/root/.cache/vllm": vllm_cache_vol,
89+
},
9790
)
98-
@modal.asgi_app()
91+
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
9992
def serve():
100-
import fastapi
101-
import vllm.entrypoints.openai.api_server as api_server
102-
from vllm.engine.arg_utils import AsyncEngineArgs
103-
from vllm.engine.async_llm_engine import AsyncLLMEngine
104-
from vllm.entrypoints.logger import RequestLogger
105-
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
106-
from vllm.entrypoints.openai.serving_completion import (
107-
OpenAIServingCompletion,
108-
)
109-
from vllm.entrypoints.openai.serving_engine import BaseModelPath
110-
from vllm.usage.usage_lib import UsageContext
111-
112-
volume.reload() # ensure we have the latest version of the weights
113-
114-
# create a fastAPI app that uses vLLM's OpenAI-compatible router
115-
web_app = fastapi.FastAPI(
116-
title=f"OpenAI-compatible {MODEL_NAME} server",
117-
description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
118-
version="0.0.1",
119-
docs_url="/docs",
120-
)
121-
122-
# security: CORS middleware for external requests
123-
http_bearer = fastapi.security.HTTPBearer(
124-
scheme_name="Bearer Token",
125-
description="See code for authentication details.",
126-
)
127-
web_app.add_middleware(
128-
fastapi.middleware.cors.CORSMiddleware,
129-
allow_origins=["*"],
130-
allow_credentials=True,
131-
allow_methods=["*"],
132-
allow_headers=["*"],
133-
)
134-
135-
# security: inject dependency on authed routes
136-
async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
137-
if api_key.credentials != TOKEN:
138-
raise fastapi.HTTPException(
139-
status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
140-
detail="Invalid authentication credentials",
141-
)
142-
return {"username": "authenticated_user"}
143-
144-
router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])
145-
146-
# wrap vllm's router in auth router
147-
router.include_router(api_server.router)
148-
# add authed vllm to our fastAPI app
149-
web_app.include_router(router)
150-
151-
engine_args = AsyncEngineArgs(
152-
model=MODELS_DIR + "/" + MODEL_NAME,
153-
tensor_parallel_size=N_GPU,
154-
gpu_memory_utilization=0.90,
155-
max_model_len=8096,
156-
enforce_eager=False, # capture the graph for faster inference, but slower cold starts (30s > 20s)
157-
)
158-
159-
engine = AsyncLLMEngine.from_engine_args(
160-
engine_args, usage_context=UsageContext.OPENAI_API_SERVER
161-
)
162-
163-
model_config = get_model_config(engine)
164-
165-
request_logger = RequestLogger(max_log_len=2048)
166-
167-
base_model_paths = [
168-
BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
93+
import subprocess
94+
95+
cmd = [
96+
"vllm",
97+
"serve",
98+
"--uvicorn-log-level=info",
99+
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
100+
"--host",
101+
"0.0.0.0",
102+
"--port",
103+
str(VLLM_PORT),
104+
"--api-key",
105+
API_KEY,
169106
]
170107

171-
api_server.chat = lambda s: OpenAIServingChat(
172-
engine,
173-
model_config=model_config,
174-
base_model_paths=base_model_paths,
175-
chat_template=None,
176-
response_role="assistant",
177-
lora_modules=[],
178-
prompt_adapters=[],
179-
request_logger=request_logger,
180-
)
181-
api_server.completion = lambda s: OpenAIServingCompletion(
182-
engine,
183-
model_config=model_config,
184-
base_model_paths=base_model_paths,
185-
lora_modules=[],
186-
prompt_adapters=[],
187-
request_logger=request_logger,
188-
)
189-
190-
return web_app
108+
subprocess.Popen(" ".join(cmd), shell=True)
191109

192110

193111
# ## Deploy the server
@@ -229,26 +147,3 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
229147
# ```bash
230148
# modal run openai_compatible/load_test.py
231149
# ```
232-
233-
# ## Addenda
234-
235-
# The rest of the code in this example is utility code.
236-
237-
238-
def get_model_config(engine):
239-
import asyncio
240-
241-
try: # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
242-
event_loop = asyncio.get_running_loop()
243-
except RuntimeError:
244-
event_loop = None
245-
246-
if event_loop is not None and event_loop.is_running():
247-
# If the current is instanced by Ray Serve,
248-
# there is already a running event loop
249-
model_config = event_loop.run_until_complete(engine.get_model_config())
250-
else:
251-
# When using single vLLM without engine_use_ray
252-
model_config = asyncio.run(engine.get_model_config())
253-
254-
return model_config

0 commit comments

Comments
 (0)