Skip to content

Commit 16cc16a

Browse files
committed
fix lmi/vllm virtual envs, update to vllm 0.7.1
1 parent b5e4ee9 commit 16cc16a

File tree

8 files changed

+28
-29
lines changed

8 files changed

+28
-29
lines changed

engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,11 @@
2121
resolve_chat_template_content_format)
2222

2323

24-
def is_chat_completions_request(inputs: Dict) -> bool:
25-
return "messages" in inputs
26-
27-
2824
def parse_chat_completions_request_vllm(
2925
input_map: Dict,
3026
is_rolling_batch: bool,
3127
rolling_batch,
3228
tokenizer,
33-
chat_template: Optional[str] = None,
3429
configs: Properties = None,
3530
is_mistral_tokenizer: bool = False,
3631
):
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
4136
"You must enable rolling batch to use the chat completions format."
4237
)
4338

44-
if not is_mistral_tokenizer and not hasattr(tokenizer,
45-
"apply_chat_template"):
46-
raise AttributeError(
47-
f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
48-
f"please ensure that your tokenizer supports chat templates.")
49-
5039
tool_parser = rolling_batch.get_tool_parser()
5140
chat_params = ChatProperties(**input_map)
5241

@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
8574
if is_mistral_tokenizer:
8675
text_inputs = apply_mistral_chat_template(
8776
tokenizer,
88-
messages=chat_params.messages,
89-
chat_template=chat_template,
90-
add_generation_prompt=True,
77+
chat_params.messages,
78+
None,
9179
tools=tool_dicts,
9280
)
9381
else:
9482
text_inputs = apply_hf_chat_template(
9583
tokenizer,
96-
conversation=conversation,
97-
chat_template=chat_template,
84+
conversation,
85+
None,
9886
add_generation_prompt=True,
9987
tools=tool_dicts,
10088
)

engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
6666
# The following configs have different defaults, or additional processing in DJL compared to vLLM
6767
dtype: str = "auto"
6868
max_loras: int = 4
69+
task: str = 'auto'
6970
# The following configs have broken processing in vllm via the FlexibleArgumentParser
7071
long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
7172
use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
8990
f"Need python engine to start vLLM RollingBatcher")
9091
return engine
9192

93+
@field_validator('task')
94+
def validate_task(cls, task):
95+
# TODO: conflicts between HF and VLLM tasks, need to separate these.
96+
# for backwards compatibility, map text-generation to generate
97+
if task == 'text-generation':
98+
task = 'generate'
99+
return task
100+
92101
@field_validator('dtype')
93102
def validate_dtype(cls, val):
94103
if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
114123
raise ValueError(
115124
f"Invalid tool call parser: {self.tool_call_parser} "
116125
f"(chose from {{ {','.join(valid_tool_parses)} }})")
126+
return self
117127

118128
@field_validator('override_neuron_config', mode="before")
119129
def validate_override_neuron_config(cls, val):

serving/docker/lmi-container-requirements-common.txt

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
peft==0.13.2
1+
peft
22
protobuf==3.20.3
3-
transformers==4.45.2
3+
transformers>=4.45.2
44
hf-transfer
55
zstandard
66
datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
2323
sentence_transformers
2424
onnxruntime-gpu==1.20.0
2525
autoawq==0.2.5
26-
llmcompressor==0.3.1
27-
tokenizers==0.20.3
28-
pydantic==2.9.2
26+
tokenizers>=0.20.3
27+
pydantic>=2.9.2
2928
optimum==1.23.2
3029
torch==2.5.1
3130
torchvision==0.20.1

serving/docker/requirements-lmi.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
-r requirements-common.txt
2+
llmcompressor
23
# flash infer kernels for vllm/lmi-dist
34
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
45
# vllm wheel built with pt2.5.1
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
-r requirements-common.txt
2-
vllm==0.7.0
2+
llmcompressor
3+
vllm==0.7.1

serving/docker/scripts/create_virtual_env.sh

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,6 @@ requirements_file=$2
77
# This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
88
python -m venv --system-site-packages $venv_directory
99
venv_pip="${venv_directory}/bin/pip"
10-
$venv_pip install -r $requirements_file
10+
$venv_pip install -r $requirements_file || exit 1
1111
$venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
12-
git clone https://github.com/neuralmagic/AutoFP8.git
13-
cd AutoFP8
14-
git reset --hard 4b2092c
15-
$venv_pip install .
16-
cd ..
17-
rm -rf AutoFP8
1812
$venv_pip cache purge

tests/integration/llm/client.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,11 @@ def get_model_name():
602602
"seq_length": [256],
603603
"tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
604604
},
605+
"mistral-7b": {
606+
"batch_size": [1, 4],
607+
"seq_length": [256],
608+
"tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
609+
}
605610
}
606611

607612
vllm_tool_model_spec = {

tests/integration/tests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
571571
prepare.build_vllm_model("mistral-7b")
572572
r.launch()
573573
client.run("vllm mistral-7b".split())
574+
client.run("vllm_chat mistral-7b".split())
574575

575576
def test_phi2(self):
576577
with Runner('lmi', 'phi-2') as r:

0 commit comments

Comments (0)