@@ -21,16 +21,11 @@
     resolve_chat_template_content_format)


-def is_chat_completions_request(inputs: Dict) -> bool:
-    return "messages" in inputs
-
-
 def parse_chat_completions_request_vllm(
         input_map: Dict,
         is_rolling_batch: bool,
         rolling_batch,
         tokenizer,
-        chat_template: Optional[str] = None,
         configs: Properties = None,
         is_mistral_tokenizer: bool = False,
 ):

Contributor Author: deleted because it's not used
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
             "You must enable rolling batch to use the chat completions format."
         )

-    if not is_mistral_tokenizer and not hasattr(tokenizer,
-                                                "apply_chat_template"):
-        raise AttributeError(
-            f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
-            f"please ensure that your tokenizer supports chat templates.")
-
     tool_parser = rolling_batch.get_tool_parser()
     chat_params = ChatProperties(**input_map)

Contributor Author: deleted because the vllm utils do this validation for us already
@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
     if is_mistral_tokenizer:
         text_inputs = apply_mistral_chat_template(
             tokenizer,
-            messages=chat_params.messages,
-            chat_template=chat_template,
-            add_generation_prompt=True,
+            chat_params.messages,
+            None,
             tools=tool_dicts,
         )
     else:
         text_inputs = apply_hf_chat_template(
             tokenizer,
-            conversation=conversation,
-            chat_template=chat_template,
+            conversation,
+            None,
             add_generation_prompt=True,
             tools=tool_dicts,
         )
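The switch from keyword to positional arguments above appears to follow a signature change in vLLM 0.7.1's chat-template helpers; the positional None means "no explicit template", so the template bundled with the tokenizer is used. For context, a minimal sketch of the underlying Hugging Face mechanism these helpers wrap (the model name here is only an example, not taken from the PR):

# Sketch: what chat templating produces, via the plain Hugging Face API
# that apply_hf_chat_template builds on. The model name is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
messages = [{"role": "user", "content": "What is the capital of France?"}]

# Passing no explicit chat_template falls back to the template that ships
# with the tokenizer, which is what the positional None above achieves.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(prompt)  # e.g. "<s>[INST] What is the capital of France? [/INST]"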
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    task: str = 'auto'
     # The following configs have broken processing in vllm via the FlexibleArgumentParser
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine

+    @field_validator('task')
+    def validate_task(cls, task):
+        # TODO: conflicts between HF and VLLM tasks, need to separate these.
+        # for backwards compatibility, map text-generation to generate
+        if task == 'text-generation':
+            task = 'generate'
+        return task
+
     @field_validator('dtype')
     def validate_dtype(cls, val):
         if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
             raise ValueError(
                 f"Invalid tool call parser: {self.tool_call_parser} "
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")
+        return self

     @field_validator('override_neuron_config', mode="before")
     def validate_override_neuron_config(cls, val):
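For readers unfamiliar with the pydantic v2 idioms touched above, here is a self-contained sketch of both patterns: the task-name normalization, and the model validator that must return self (the missing return this diff adds would otherwise make the validated instance None). RbProps is a stand-in for VllmRbProperties, and the parser names are invented for illustration:

# Sketch only: RbProps stands in for VllmRbProperties.
from typing import Optional
from pydantic import BaseModel, field_validator, model_validator

VALID_TOOL_PARSERS = {"mistral", "llama3_json"}  # illustrative values

class RbProps(BaseModel):
    task: str = 'auto'
    tool_call_parser: Optional[str] = None

    @field_validator('task')
    def validate_task(cls, task):
        # For backwards compatibility, map the HF task name to vLLM's.
        if task == 'text-generation':
            task = 'generate'
        return task

    @model_validator(mode="after")
    def validate_tool_call_parser(self):
        if (self.tool_call_parser is not None
                and self.tool_call_parser not in VALID_TOOL_PARSERS):
            raise ValueError(
                f"Invalid tool call parser: {self.tool_call_parser}")
        # An after-mode model validator must return the model; without the
        # `return self` added in this diff, validation would yield None.
        return self

print(RbProps(task='text-generation').task)  # -> generate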
9 changes: 4 additions & 5 deletions serving/docker/lmi-container-requirements-common.txt
@@ -1,6 +1,6 @@
-peft==0.13.2
+peft
 protobuf==3.20.3
-transformers==4.45.2
+transformers>=4.45.2
 hf-transfer
 zstandard
 datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
 sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq==0.2.5
-llmcompressor==0.3.1
-tokenizers==0.20.3
-pydantic==2.9.2
+tokenizers>=0.20.3
+pydantic>=2.9.2
 optimum==1.23.2
 torch==2.5.1
 torchvision==0.20.1
1 change: 1 addition & 0 deletions serving/docker/requirements-lmi.txt
@@ -1,4 +1,5 @@
 -r requirements-common.txt
+llmcompressor
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
3 changes: 2 additions & 1 deletion serving/docker/requirements-vllm.txt
@@ -1,2 +1,3 @@
 -r requirements-common.txt
-vllm==0.7.0
+llmcompressor
+vllm==0.7.1
8 changes: 1 addition & 7 deletions serving/docker/scripts/create_virtual_env.sh
@@ -7,12 +7,6 @@ requirements_file=$2
 # This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
 python -m venv --system-site-packages $venv_directory
 venv_pip="${venv_directory}/bin/pip"
-$venv_pip install -r $requirements_file
+$venv_pip install -r $requirements_file || exit 1
 $venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
-git clone https://github.com/neuralmagic/AutoFP8.git
-cd AutoFP8
-git reset --hard 4b2092c
-$venv_pip install .
-cd ..
-rm -rf AutoFP8
 $venv_pip cache purge

Contributor: Do we not need FP8 installation?

Contributor Author: not anymore! we're using llm compressor now #2701
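The `|| exit 1` added above makes a failed dependency install abort the image build instead of leaving a half-provisioned virtual environment. The same fail-fast idea in Python, for scripts that drive pip programmatically (the helper name is invented for this sketch):

# Sketch: fail fast when an install step returns a non-zero exit code.
import subprocess
import sys

def install_requirements(venv_pip: str, requirements_file: str) -> None:
    # check=True raises CalledProcessError on failure, mirroring the
    # shell script's `|| exit 1`.
    subprocess.run([venv_pip, "install", "-r", requirements_file], check=True)

if __name__ == "__main__":
    try:
        install_requirements(sys.argv[1], sys.argv[2])
    except subprocess.CalledProcessError as err:
        sys.exit(err.returncode)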
5 changes: 5 additions & 0 deletions tests/integration/llm/client.py
@@ -602,6 +602,11 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
     },
+    "mistral-7b": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
+    }
 }

 vllm_tool_model_spec = {
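Entries in this spec are keyed by model name and parameterize the client's request sweep. A hedged sketch of how such an entry is typically consumed (the payload shape and loop are illustrative, not taken from client.py):

# Sketch: iterating a model spec to parameterize test requests.
spec = {
    "batch_size": [1, 4],
    "seq_length": [256],
    "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
}

for batch_size in spec["batch_size"]:
    for seq_length in spec["seq_length"]:
        # One request template per (batch_size, seq_length) combination.
        payload = {"inputs": "Hello", "parameters": {"max_new_tokens": seq_length}}
        print(f"batch of {batch_size}: {payload}")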
1 change: 1 addition & 0 deletions tests/integration/tests.py
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
             prepare.build_vllm_model("mistral-7b")
             r.launch()
             client.run("vllm mistral-7b".split())
+            client.run("vllm_chat mistral-7b".split())

     def test_phi2(self):
         with Runner('lmi', 'phi-2') as r:
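The new vllm_chat run exercises the chat completions path enabled by the earlier changes. A hedged sketch of the kind of request it is expected to send; the endpoint URL and response handling are assumptions, not taken from the test client:

# Sketch: an OpenAI-style chat completions request to a local endpoint.
import json
import urllib.request

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Mistral 7B?"},
    ],
    "max_tokens": 256,
}
req = urllib.request.Request(
    "http://localhost:8080/invocations",  # assumed local serving endpoint
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))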