187 changes: 41 additions & 146 deletions 06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -11,11 +11,6 @@
# and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).

# In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.
# You can find a video walkthrough of this example on our YouTube channel [here](https://www.youtube.com/watch?v=QmY_7ePR1hM).

# Note that the vLLM server is a FastAPI app, which can be configured and extended just like any other.
# Here, we use it to add simple authentication middleware, following the
# [implementation in the vLLM repository](https://github.com/vllm-project/vllm/blob/v0.5.3post1/vllm/entrypoints/openai/api_server.py).

# Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs
# [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).
@@ -31,8 +26,15 @@

import modal

vllm_image = modal.Image.debian_slim(python_version="3.12").pip_install(
    "vllm==0.6.3post1", "fastapi[standard]==0.115.4"
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "VLLM_USE_V1": "1"})
)

# ## Download the model weights
@@ -48,146 +50,62 @@
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"

# We need to make the weights of that model available to our Modal Functions.

# So to follow along with this example, you'll need to download those weights
# onto a Modal Volume by running another script from the
# [examples repository](https://github.com/modal-labs/modal-examples).

try:
    volume = modal.Volume.from_name("llamas", create_if_missing=False).hydrate()
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download_llama.py")
# Although vLLM will download weights on-demand, we want to cache them if possible. We'll use [Modal Volumes](https://modal.com/docs/guide/volumes)
# as a cache, which act as a "shared disk" that all Functions can access.
hf_cache_vol = modal.Volume.from_name(
    "huggingface-cache", create_if_missing=True
)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
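
# If you want to warm that cache ahead of time instead of paying the download
# cost on the first cold start, one option is a small helper Function that calls
# `snapshot_download` into the mounted cache directory. This is only a sketch and
# not part of this example: the helper name is made up, and it assumes the `app`
# object defined further down in this file.

# ```python
# @app.function(
#     image=vllm_image,
#     volumes={"/root/.cache/huggingface": hf_cache_vol},
#     timeout=30 * 60,  # seconds
# )
# def download_model():
#     from huggingface_hub import snapshot_download
#
#     # writes into /root/.cache/huggingface, which is backed by hf_cache_vol
#     snapshot_download(MODEL_NAME, revision=MODEL_REVISION)
#     hf_cache_vol.commit()  # persist the downloaded files to the Volume
# ```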


# ## Build a vLLM engine and serve it

# vLLM's OpenAI-compatible server is exposed as a [FastAPI](https://fastapi.tiangolo.com/) router.

# FastAPI is a Python web framework that implements the [ASGI standard](https://en.wikipedia.org/wiki/Asynchronous_Server_Gateway_Interface),
# much like [Flask](https://en.wikipedia.org/wiki/Flask_(web_framework)) is a Python web framework
# that implements the [WSGI standard](https://en.wikipedia.org/wiki/Web_Server_Gateway_Interface).

# Modal offers [first-class support for ASGI (and WSGI) apps](https://modal.com/docs/guide/webhooks). We just need to decorate a function that returns the app
# with `@modal.asgi_app()` (or `@modal.wsgi_app()`) and then add it to the Modal app with the `app.function` decorator.

# The function below first imports the FastAPI router from the vLLM library, then adds authentication compatible with OpenAI client libraries. You might also add more routes here.

# Then, the function creates an `AsyncLLMEngine`, the core of the vLLM server. It's responsible for loading the model, running inference, and serving responses.
# We start a vLLM server as a subprocess, which Modal has [first-class support for](https://modal.com/docs/reference/modal.web_server). We configure Modal
# to forward requests to port 8000.

# After attaching that engine to the FastAPI app via the `api_server` module of the vLLM library, we return the FastAPI app
# so it can be served on Modal.
# The function below spawns a vLLM instance listening at port 8000, serving requests to our model. vLLM will authenticate requests
# using the API key we provide it.

app = modal.App("example-vllm-openai-compatible")

N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = "super-secret-token" # auth token. for production use, replace with a modal.Secret
API_KEY = "super-secret-key" # api key, for auth. for production use, replace with a modal.Secret

MINUTES = 60 # seconds
HOURS = 60 * MINUTES

VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=1000,
    volumes={MODELS_DIR: volume},
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.asgi_app()
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.entrypoints.openai.serving_engine import BaseModelPath
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=8096,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    base_model_paths = [
        BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
    ]

    api_server.chat = lambda s: OpenAIServingChat(
        engine,
        model_config=model_config,
        base_model_paths=base_model_paths,
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.completion = lambda s: OpenAIServingCompletion(
        engine,
        model_config=model_config,
        base_model_paths=base_model_paths,
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app
    subprocess.Popen(" ".join(cmd), shell=True)


# ## Deploy the server
@@ -229,26 +147,3 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
# ```bash
# modal run openai_compatible/load_test.py
# ```
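
# Once the server is deployed, any OpenAI-compatible client can talk to it, as
# long as it sends the API key we configured above. As a quick sanity check, here
# is a minimal sketch using the official `openai` Python client; the `base_url`
# below is a placeholder, so substitute the URL Modal prints when you deploy.

# ```python
# from openai import OpenAI
#
# client = OpenAI(
#     api_key="super-secret-key",  # must match API_KEY in this file
#     base_url="https://your-workspace--example-vllm-openai-compatible-serve.modal.run/v1",  # placeholder
# )
# response = client.chat.completions.create(
#     model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
# )
# print(response.choices[0].message.content)
# ```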

# ## Addenda

# The rest of the code in this example is utility code.


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # If the current process was instanced by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config