Commit 3aa4a73

Vllm batch upgrade (#712)
* updates to support vllm 0.10.2
* bump to 0.10.2
* cleanup
1 parent 55ff1da commit 3aa4a73

4 files changed: +15 -12 lines changed

model-engine/model_engine_server/inference/vllm/Dockerfile.vllm

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-ARG VLLM_VERSION=0.10.1.1
+ARG VLLM_VERSION=0.10.2
 ARG VLLM_BASE_REPO=vllm/vllm-openai
 ARG VLLM_BASE_IMAGE=${VLLM_BASE_REPO}:v${VLLM_VERSION}
 FROM ${VLLM_BASE_IMAGE} AS base

model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ fi
 ACCOUNT=$1
 IMAGE_TAG=$2
 BUILD_TARGET=$3
-VLLM_VERSION=${VLLM_VERSION:-"0.10.1.1"}
+VLLM_VERSION=${VLLM_VERSION:-"0.10.2"}
 VLLM_BASE_REPO=${VLLM_BASE_REPO:-"vllm/vllm-openai"}

 # if build target = vllm use vllm otherwise use vllm_batch

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-vllm==0.10.1.1
+vllm==0.10.2

model-engine/model_engine_server/inference/vllm/vllm_batch.py

Lines changed: 12 additions & 9 deletions
@@ -53,7 +53,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.utils import merge_async_iterators

 CONFIG_FILE = os.getenv("CONFIG_FILE")
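
The only import change needed for 0.10.2 is the line above: serving_models now supplies both BaseModelPath and the OpenAIServingModels registry used further down. If a deployment ever had to straddle the old and new pins, a guarded import along these lines would be one option. This shim is purely illustrative and not part of the commit; the fallback branch only recovers BaseModelPath, since that is all this file previously imported from serving_engine.

# Illustrative only: vllm_batch.py pins vllm==0.10.2 and imports from
# serving_models unconditionally; this compatibility shim is NOT in the commit.
try:
    # Import layout used by this commit (vllm==0.10.2).
    from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
except ImportError:
    # Layout the file relied on under vllm==0.10.1.1.
    from vllm.entrypoints.openai.serving_engine import BaseModelPath
    OpenAIServingModels = None  # sentinel: the serving_models registry is unavailable
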
@@ -202,7 +202,7 @@ def determine_max_concurrent_requests(
     # anecdotally, we're seeing the engine able to handle around 7req/s (for outlines), so set to 30 * 7 ~= 200
     if any(
         request.to_sampling_params(
-            default_max_tokens=1, logits_processor_pattern=None
+            max_tokens=1, logits_processor_pattern=None, default_sampling_params={}
         ).guided_decoding
         for request in requests
     ):
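
This is the only place the batch runner calls to_sampling_params, and it does so solely to see whether any request in the batch asks for guided decoding. A minimal standalone sketch of that check against the 0.10.2-style call above; the helper name and type hints are illustrative, not from the commit.

from typing import Sequence, Union

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest

OpenAIRequest = Union[ChatCompletionRequest, CompletionRequest]


def batch_uses_guided_decoding(requests: Sequence[OpenAIRequest]) -> bool:
    # Mirrors the updated call in determine_max_concurrent_requests: sampling
    # params are built only so .guided_decoding can be inspected.
    return any(
        request.to_sampling_params(
            max_tokens=1,                 # dummy value; no generation happens here
            logits_processor_pattern=None,
            default_sampling_params={},   # keyword used by the 0.10.2-style call above
        ).guided_decoding
        for request in requests
    )
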
@@ -294,7 +294,6 @@ async def init_engine(
             os.environ.get("NUM_INSTANCES", 1)
         ),  # TODO maybe do something other than TP=8, PP=number of nodes
         seed=request.model_cfg.seed or 0,
-        disable_log_requests=True,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
     default_engine_args_dict.update(engine_args_dict)
@@ -304,15 +303,21 @@ async def init_engine(
     engine_client = AsyncLLMEngine.from_engine_args(engine_args)
     model_config = await engine_client.get_model_config()
     resolved_chat_template = load_chat_template(parsed_configs.chat_template)
+
     base_model_paths = [BaseModelPath(name=served_model_name, model_path=model_id)]

+    openai_serving_models = OpenAIServingModels(
+        engine_client=engine_client,
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+    )
+    await openai_serving_models.init_static_loras()
+
     openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         response_role=request.model_cfg.response_role or "assistant",
-        lora_modules=None,
-        prompt_adapters=None,
         request_logger=None,
         chat_template=resolved_chat_template,
         chat_template_content_format=None,
@@ -321,9 +326,7 @@ async def init_engine(
     openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=None,
-        prompt_adapters=None,
+        openai_serving_models,
         request_logger=None,
     )

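Taken together, the vllm_batch.py edits replace the old base_model_paths / lora_modules / prompt_adapters plumbing with a single OpenAIServingModels object shared by both OpenAI-compatible front-ends. Below is a condensed sketch of that wiring under vLLM 0.10.2, with the constructor calls taken from the hunks above; the helper function, its parameters, and the hard-coded "assistant" default are illustrative, whereas in the real file these values come from the request's model_cfg and the engine built in init_engine.

from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels


async def build_openai_serving_layers(
    engine_client,              # engine built via AsyncLLMEngine.from_engine_args(), as in init_engine
    model_config,               # await engine_client.get_model_config()
    model_id: str,
    served_model_name: str,
    resolved_chat_template=None,
):
    # Register the served model once; this object replaces the raw base_model_paths
    # (and the removed lora_modules / prompt_adapters arguments).
    base_model_paths = [BaseModelPath(name=served_model_name, model_path=model_id)]
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
        model_config=model_config,
        base_model_paths=base_model_paths,
    )
    await openai_serving_models.init_static_loras()

    # Both front-ends now take the shared models object in the slot where
    # base_model_paths used to go.
    openai_serving_chat = OpenAIServingChat(
        engine_client,
        model_config,
        openai_serving_models,
        response_role="assistant",          # vllm_batch.py uses request.model_cfg.response_role or "assistant"
        request_logger=None,
        chat_template=resolved_chat_template,
        chat_template_content_format=None,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine_client,
        model_config,
        openai_serving_models,
        request_logger=None,
    )
    return openai_serving_chat, openai_serving_completion
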
0 commit comments
