
Commit 7f8fea9

[MLI-4908] Update vllm version (#703)
1 parent 988e31c

4 files changed (+46, -22 lines)


model-engine/model_engine_server/inference/vllm/Dockerfile.vllm

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-ARG VLLM_VERSION=0.6.3
+ARG VLLM_VERSION=0.10.0
 ARG VLLM_BASE_REPO=vllm/vllm-openai
 ARG VLLM_BASE_IMAGE=${VLLM_BASE_REPO}:v${VLLM_VERSION}
 FROM ${VLLM_BASE_IMAGE} AS base
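Since VLLM_VERSION is only an ARG default, the base image can still be overridden per build. A minimal sketch, assuming a build from the repository root; the image tag is a placeholder, and the batch targets mentioned in the build script may additionally need --target to select a stage:

# Override the default vLLM base image version at build time;
# --build-arg takes precedence over the ARG default in the Dockerfile.
docker build \
  --build-arg VLLM_VERSION=0.10.0 \
  -f model-engine/model_engine_server/inference/vllm/Dockerfile.vllm \
  -t vllm-openai-custom:dev \
  .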

model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@ set -eo pipefail
 
 # Build and push vLLM docker image to AWS ECR.
 #
-# Usage: VLLM_VERSION=0.6.6.post1 ./build_and_upload_image.sh <AWS_ACCOUNT_ID> <IMAGE_TAG> vllm|vllm_batch|vllm_batch_v2
+# Usage: VLLM_VERSION=0.10.0 ./build_and_upload_image.sh <AWS_ACCOUNT_ID> <IMAGE_TAG> vllm|vllm_batch|vllm_batch_v2
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 PROJECT_DIR=$SCRIPT_DIR/../../../..
@@ -29,7 +29,7 @@ fi
 ACCOUNT=$1
 IMAGE_TAG=$2
 BUILD_TARGET=$3
-VLLM_VERSION=${VLLM_VERSION:-"0.6.6.post1"}
+VLLM_VERSION=${VLLM_VERSION:-"0.10.0"}
 VLLM_BASE_REPO=${VLLM_BASE_REPO:-"vllm/vllm-openai"}
 
 # if build target = vllm use vllm otherwise use vllm_batch
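A typical invocation follows the updated usage line; the account ID and image tag below are placeholders. Leaving VLLM_VERSION unset now builds against 0.10.0 by default:

# Build and push the image for the plain vLLM server target.
VLLM_VERSION=0.10.0 ./build_and_upload_image.sh 123456789012 v0.10.0 vllm
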
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-vllm==0.6.6.post1
+vllm==0.10.0

model-engine/model_engine_server/inference/vllm/vllm_server.py

Lines changed: 42 additions & 18 deletions

@@ -3,12 +3,12 @@
 import json
 import os
 import signal
-import socket
 import subprocess
 import traceback
 from logging import Logger
 from typing import AsyncGenerator, Dict, List, Optional
 
+import vllm.envs as envs
 from fastapi import APIRouter, BackgroundTasks, Request
 from fastapi.responses import Response, StreamingResponse
 from vllm.engine.async_llm_engine import (
@@ -17,13 +17,20 @@
 )
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.launcher import serve_http
-from vllm.entrypoints.openai.api_server import build_app, build_async_engine_client, init_app_state
+from vllm.entrypoints.openai.api_server import (
+    build_app,
+    build_async_engine_client,
+    init_app_state,
+    load_log_config,
+    maybe_register_tokenizer_info_endpoint,
+    setup_server,
+)
 from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
 from vllm.utils import FlexibleArgumentParser, random_uuid
-from vllm.version import __version__ as VLLM_VERSION
 
 logger = Logger("vllm_server")
 
@@ -197,34 +204,48 @@ def parse_args(parser: FlexibleArgumentParser):
 
 
 async def run_server(args, **uvicorn_kwargs) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    logger.info("args: %s", args)
+    """Run a single-worker API server."""
+    listen_address, sock = setup_server(args)
+    await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+
+
+async def run_server_worker(
+    listen_address, sock, args, client_config=None, **uvicorn_kwargs
+) -> None:
+    """Run a single API server worker."""
 
-    temp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # nosemgrep
-    temp_socket.bind(("", args.port))
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
-    def signal_handler(*_) -> None:
-        # Interrupt server on sigterm while initializing
-        raise KeyboardInterrupt("terminated")
+    server_index = client_config.get("client_index", 0) if client_config else 0
 
-    signal.signal(signal.SIGTERM, signal_handler)
+    # Load logging config for uvicorn if specified
+    log_config = load_log_config(args.log_config_file)
+    if log_config is not None:
+        uvicorn_kwargs["log_config"] = log_config
 
     global engine_client
-    async with build_async_engine_client(args) as engine_client:
-        app = build_app(args)
 
-        model_config = await engine_client.get_model_config()
-        init_app_state(engine_client, model_config, app.state, args)
+    async with build_async_engine_client(args, client_config) as engine_client:
+        maybe_register_tokenizer_info_endpoint(args)
+        app = build_app(args)
 
-        temp_socket.close()
+        vllm_config = await engine_client.get_vllm_config()
+        await init_app_state(engine_client, vllm_config, app.state, args)
         app.include_router(router)
 
+        logger.info("Starting vLLM API server %d on %s", server_index, listen_address)
         shutdown_task = await serve_http(
            app,
+           sock=sock,
+           enable_ssl_refresh=args.enable_ssl_refresh,
            host=args.host,
            port=args.port,
            log_level=args.uvicorn_log_level,
-           timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+           # NOTE: When the 'disable_uvicorn_access_log' value is True,
+           # no access log will be output.
+           access_log=not args.disable_uvicorn_access_log,
+           timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
            ssl_keyfile=args.ssl_keyfile,
            ssl_certfile=args.ssl_certfile,
            ssl_ca_certs=args.ssl_ca_certs,
@@ -233,7 +254,10 @@ def signal_handler(*_) -> None:
         )
 
     # NB: Await server shutdown only after the backend context is exited
-    await shutdown_task
+    try:
+        await shutdown_task
+    finally:
+        sock.close()
 
 
 if __name__ == "__main__":
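Net effect of the refactor: the server now follows the vLLM 0.10 entrypoint pattern, where setup_server() binds the listening socket once and run_server_worker() hands it to serve_http, replacing the old temporary-socket availability probe and manual SIGTERM handler; the socket is closed in a finally block after shutdown. A minimal launch sketch, assuming the standard flags wired in by make_arg_parser; the model name and port are placeholders:

# Start the OpenAI-compatible server defined in vllm_server.py.
python vllm_server.py \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --host 0.0.0.0 \
  --port 5005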
