
Commit d9dda20

ksuma2109 and Suma Kasa authored
Upgrade vllm to 0.15.0 with DeepSeek and GPT OSS Eagle Regression Fix (#2995)
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
1 parent 39ab90b commit d9dda20

File tree: 6 files changed, +27 -51 lines

engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py

Lines changed: 2 additions & 2 deletions

@@ -14,11 +14,11 @@
 
 from pydantic import Field
 from vllm import TokensPrompt
-from vllm.entrypoints.openai.serving_engine import RequestPrompt, TextTokensPrompt
+from vllm.entrypoints.openai.protocol import RequestPrompt, TextTokensPrompt
 from vllm.tool_parsers import ToolParser
 from vllm.tokenizers.mistral import maybe_serialize_tool_calls
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.entrypoints.chat_utils import (
     apply_hf_chat_template, apply_mistral_chat_template, parse_chat_messages,
     resolve_chat_template_content_format, ChatCompletionMessageParam,
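
Note: the old and new module paths above are taken directly from this diff. For code that has to run against both vLLM layouts during a migration window, a hedged import shim along these lines (not part of this commit) would work:

# Hypothetical compatibility shim; both paths are those shown in the diff above.
try:
    # vLLM >= 0.15 layout used by this commit
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
except ImportError:
    # pre-0.15 layout
    from vllm.entrypoints.openai.protocol import ChatCompletionRequest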

engines/python/setup/djl_python/lmi_vllm/request_response_utils.py

Lines changed: 6 additions & 4 deletions

@@ -12,14 +12,16 @@
 # the specific language governing permissions and limitations under the License.
 import json
 from typing import Callable, Tuple, Union, List, Dict
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.completion.protocol import (
     CompletionRequest,
-    ChatCompletionRequest,
     CompletionResponse,
-    ChatCompletionResponse,
-    ErrorResponse,
     CompletionLogProbs,
 )
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.logprobs import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer

engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py

Lines changed: 8 additions & 8 deletions

@@ -16,13 +16,13 @@
 from typing import Optional, Union, AsyncGenerator
 
 from vllm import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    CompletionRequest,
-)
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_models import OpenAIServingModels, BaseModelPath
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.completion.protocol import CompletionRequest
+
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
+from vllm.entrypoints.openai.models.protocol import BaseModelPath
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.utils.counter import AtomicCounter
 from vllm.utils.system_utils import kill_process_tree
 

@@ -123,7 +123,7 @@ async def initialize(self, properties: dict):
             async_engine=True)
         self.vllm_engine = AsyncLLMEngine.from_engine_args(
             self.vllm_engine_args)
-        self.tokenizer = await self.vllm_engine.get_tokenizer()
+        self.tokenizer = self.vllm_engine.get_tokenizer()
 
         model_names = self.vllm_engine_args.served_model_name or "lmi"
         if not isinstance(model_names, list):
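
Judging from the second hunk, AsyncLLMEngine.get_tokenizer() no longer has to be awaited in the upgraded vLLM. A minimal sketch (not part of this commit) for code that must tolerate both behaviors, assuming engine is an AsyncLLMEngine and the caller is inside an async function:

import inspect

# get_tokenizer() returned a coroutine before this upgrade and the tokenizer
# itself after it; awaiting only when needed covers both cases.
maybe_tokenizer = engine.get_tokenizer()
tokenizer = await maybe_tokenizer if inspect.isawaitable(maybe_tokenizer) else maybe_tokenizer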
serving/docker/lmi-container-requirements.txt

Lines changed: 9 additions & 35 deletions

@@ -1,38 +1,12 @@
 torch==2.9.1
-autoawq
-torchvision
-peft==0.15.1
-protobuf>=6.30.0
-transformers==4.57.1
+transformers >= 4.56.0, < 5
+huggingface-hub
 hf-transfer
-zstandard
-datasets==3.0.1
-mpi4py
-sentencepiece
-tiktoken
-blobfile
-einops
-accelerate==1.0.1
-bitsandbytes
-pandas
-pyarrow
-jinja2
-retrying
-opencv-contrib-python-headless
-safetensors
-scipy==1.16.0
-onnx==1.19.0
-sentence_transformers
-onnxruntime-gpu==1.20.0
-autoawq
-tokenizers
-pydantic>=2.12.0
+peft==0.15.1
+sentence-transformers==3.3.1
 optimum==1.23.2
-uvloop
-ninja
-peft
-llmcompressor
-vllm==0.14.0
-xgrammar
-flashinfer-python==0.5.3
-lmcache
+llmcompressor==0.9.0.1
+mpi4py==4.0.1
+https://djl-ai.s3.us-east-1.amazonaws.com/publish/vllm/vllm-0.15.1.dev2%2Bgb225806e5.cu128-cp312-cp312-linux_x86_64.whl
+lmcache
+autoawq
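
A quick sanity check (not part of this commit) that the pinned wheel, rather than a PyPI release, ended up in the image; the expected version string is an assumption read off the wheel filename:

import vllm

# The wheel above is tagged 0.15.1.dev2+gb225806e5; a different version here
# suggests pip resolved vllm from another source.
print(vllm.__version__)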

serving/docker/lmi.Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -90,7 +90,7 @@ RUN scripts/patch_oss_dlc.sh python \
 
 COPY lmi-container-requirements.txt ./requirements.txt
 RUN pip3 install --upgrade pip setuptools
-RUN pip3 install torch==2.9.1 torchvision \
+RUN pip3 install torch==2.9.1 torchvision\
     && pip3 install -r requirements.txt \
     && pip3 install ${djl_converter_wheel} --no-deps
tests/integration/tests.py

Lines changed: 1 addition & 1 deletion

@@ -370,7 +370,7 @@ class TestVllm1_g6:
 
     def test_gemma_2b(self):
         with Runner("lmi", "gemma-2b") as r:
-            prepare.build_vllm_model("gemma-2b")
+            prepare.build_vllm_async_model("gemma-2b")
             r.launch()
             client.run("vllm gemma-2b".split())