187 changes: 41 additions & 146 deletions 06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -11,11 +11,6 @@
# and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).

# In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.
# You can find a video walkthrough of this example on our YouTube channel [here](https://www.youtube.com/watch?v=QmY_7ePR1hM).

# Note that the vLLM server is a FastAPI app, which can be configured and extended just like any other.
# Here, we use it to add simple authentication middleware, following the
# [implementation in the vLLM repository](https://github.com/vllm-project/vllm/blob/v0.5.3post1/vllm/entrypoints/openai/api_server.py).

# Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs
# [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).
@@ -31,8 +26,15 @@

import modal

vllm_image = modal.Image.debian_slim(python_version="3.12").pip_install(
    "vllm==0.6.3post1", "fastapi[standard]==0.115.4"
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "VLLM_USE_V1": "1"})
)

# ## Download the model weights
@@ -48,146 +50,62 @@
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"

# We need to make the weights of that model available to our Modal Functions.

# So to follow along with this example, you'll need to download those weights
# onto a Modal Volume by running another script from the
# [examples repository](https://github.com/modal-labs/modal-examples).

try:
    volume = modal.Volume.from_name("llamas", create_if_missing=False).hydrate()
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download_llama.py")
# Although vLLM will download weights on-demand, we want to cache them if possible. We'll use [Modal Volumes](https://modal.com/docs/guide/volumes)
# as a cache, which act as a "shared disk" that all Functions can access.
hf_cache_vol = modal.Volume.from_name(
    "huggingface-cache", create_if_missing=True
)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
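
# If you want to warm that cache ahead of time instead of paying the download
# cost on the first cold start, one option is a small helper Function that calls
# `snapshot_download` into the mounted cache directory. This is only a sketch and
# not part of this example: the helper name is made up, and it assumes the `app`
# object defined further down in this file.

# ```python
# @app.function(
#     image=vllm_image,
#     volumes={"/root/.cache/huggingface": hf_cache_vol},
#     timeout=30 * 60,  # seconds
# )
# def download_model():
#     from huggingface_hub import snapshot_download
#
#     # writes into /root/.cache/huggingface, which is backed by hf_cache_vol
#     snapshot_download(MODEL_NAME, revision=MODEL_REVISION)
#     hf_cache_vol.commit()  # persist the downloaded files to the Volume
# ```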


# ## Build a vLLM engine and serve it

# vLLM's OpenAI-compatible server is exposed as a [FastAPI](https://fastapi.tiangolo.com/) router.

# FastAPI is a Python web framework that implements the [ASGI standard](https://en.wikipedia.org/wiki/Asynchronous_Server_Gateway_Interface),
# much like [Flask](https://en.wikipedia.org/wiki/Flask_(web_framework)) is a Python web framework
# that implements the [WSGI standard](https://en.wikipedia.org/wiki/Web_Server_Gateway_Interface).

# Modal offers [first-class support for ASGI (and WSGI) apps](https://modal.com/docs/guide/webhooks). We just need to decorate a function that returns the app
# with `@modal.asgi_app()` (or `@modal.wsgi_app()`) and then add it to the Modal app with the `app.function` decorator.

# The function below first imports the FastAPI router from the vLLM library, then adds authentication compatible with OpenAI client libraries. You might also add more routes here.

# Then, the function creates an `AsyncLLMEngine`, the core of the vLLM server. It's responsible for loading the model, running inference, and serving responses.
# We start a vLLM server as a subprocess, which Modal has [first-class support for](https://modal.com/docs/reference/modal.web_server). We configure Modal
# to forward requests to port 8000.

# After attaching that engine to the FastAPI app via the `api_server` module of the vLLM library, we return the FastAPI app
# so it can be served on Modal.
# The function below spawns a vLLM instance listening at port 8000, serving requests to our model. vLLM will authenticate requests
# using the API key we provide it.

app = modal.App("example-vllm-openai-compatible")

N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = "super-secret-token" # auth token. for production use, replace with a modal.Secret
API_KEY = "super-secret-key" # api key, for auth. for production use, replace with a modal.Secret

MINUTES = 60 # seconds
HOURS = 60 * MINUTES

VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=1000,
    volumes={MODELS_DIR: volume},
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.asgi_app()
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.entrypoints.openai.serving_engine import BaseModelPath
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=8096,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    base_model_paths = [
        BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
    ]

    api_server.chat = lambda s: OpenAIServingChat(
        engine,
        model_config=model_config,
        base_model_paths=base_model_paths,
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.completion = lambda s: OpenAIServingCompletion(
        engine,
        model_config=model_config,
        base_model_paths=base_model_paths,
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app
    subprocess.Popen(" ".join(cmd), shell=True)


# ## Deploy the server
@@ -229,26 +147,3 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
# ```bash
# modal run openai_compatible/load_test.py
# ```
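
# Once the server is deployed, any OpenAI-compatible client can talk to it, as
# long as it sends the API key we configured above. As a quick sanity check, here
# is a minimal sketch using the official `openai` Python client; the `base_url`
# below is a placeholder, so substitute the URL Modal prints when you deploy.

# ```python
# from openai import OpenAI
#
# client = OpenAI(
#     api_key="super-secret-key",  # must match API_KEY in this file
#     base_url="https://your-workspace--example-vllm-openai-compatible-serve.modal.run/v1",  # placeholder
# )
# response = client.chat.completions.create(
#     model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
# )
# print(response.choices[0].message.content)
# ```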

# ## Addenda

# The rest of the code in this example is utility code.


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # If the current process was instanced by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config