@@ -21,16 +21,11 @@
     resolve_chat_template_content_format)


-def is_chat_completions_request(inputs: Dict) -> bool:
-    return "messages" in inputs
-
-
 def parse_chat_completions_request_vllm(
         input_map: Dict,
         is_rolling_batch: bool,
         rolling_batch,
         tokenizer,
-        chat_template: Optional[str] = None,
         configs: Properties = None,
         is_mistral_tokenizer: bool = False,
 ):

Contributor Author: deleted because it's not used
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
             "You must enable rolling batch to use the chat completions format."
         )

-    if not is_mistral_tokenizer and not hasattr(tokenizer,
-                                                "apply_chat_template"):
-        raise AttributeError(
-            f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
-            f"please ensure that your tokenizer supports chat templates.")
-
     tool_parser = rolling_batch.get_tool_parser()
     chat_params = ChatProperties(**input_map)

Contributor Author: deleted because the vllm utils do this validation for us already
@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
     if is_mistral_tokenizer:
         text_inputs = apply_mistral_chat_template(
             tokenizer,
-            messages=chat_params.messages,
-            chat_template=chat_template,
-            add_generation_prompt=True,
+            chat_params.messages,
+            None,
             tools=tool_dicts,
         )
     else:
         text_inputs = apply_hf_chat_template(
             tokenizer,
-            conversation=conversation,
-            chat_template=chat_template,
+            conversation,
+            None,
             add_generation_prompt=True,
             tools=tool_dicts,
         )
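The switch from keyword to positional arguments above appears to follow a signature change in vLLM 0.7.1's chat-template helpers; the positional None means "no explicit template", so the template bundled with the tokenizer is used. For context, a minimal sketch of the underlying Hugging Face mechanism these helpers wrap (the model name here is only an example, not taken from the PR):

# Sketch: what chat templating produces, via the plain Hugging Face API
# that apply_hf_chat_template builds on. The model name is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
messages = [{"role": "user", "content": "What is the capital of France?"}]

# Passing no explicit chat_template falls back to the template that ships
# with the tokenizer, which is what the positional None above achieves.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(prompt)  # e.g. "<s>[INST] What is the capital of France? [/INST]"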
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    task: str = 'auto'
     # The following configs have broken processing in vllm via the FlexibleArgumentParser
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine

+    @field_validator('task')
+    def validate_task(cls, task):
+        # TODO: conflicts between HF and VLLM tasks, need to separate these.
+        # for backwards compatibility, map text-generation to generate
+        if task == 'text-generation':
+            task = 'generate'
+        return task
+
     @field_validator('dtype')
     def validate_dtype(cls, val):
         if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
             raise ValueError(
                 f"Invalid tool call parser: {self.tool_call_parser} "
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")
+        return self

     @field_validator('override_neuron_config', mode="before")
     def validate_override_neuron_config(cls, val):
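For readers unfamiliar with the pydantic v2 idioms touched above, here is a self-contained sketch of both patterns: the task-name normalization, and the model validator that must return self (the missing return this diff adds would otherwise make the validated instance None). RbProps is a stand-in for VllmRbProperties, and the parser names are invented for illustration:

# Sketch only: RbProps stands in for VllmRbProperties.
from typing import Optional
from pydantic import BaseModel, field_validator, model_validator

VALID_TOOL_PARSERS = {"mistral", "llama3_json"}  # illustrative values

class RbProps(BaseModel):
    task: str = 'auto'
    tool_call_parser: Optional[str] = None

    @field_validator('task')
    def validate_task(cls, task):
        # For backwards compatibility, map the HF task name to vLLM's.
        if task == 'text-generation':
            task = 'generate'
        return task

    @model_validator(mode="after")
    def validate_tool_call_parser(self):
        if (self.tool_call_parser is not None
                and self.tool_call_parser not in VALID_TOOL_PARSERS):
            raise ValueError(
                f"Invalid tool call parser: {self.tool_call_parser}")
        # An after-mode model validator must return the model; without the
        # `return self` added in this diff, validation would yield None.
        return self

print(RbProps(task='text-generation').task)  # -> generate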
9 changes: 4 additions & 5 deletions serving/docker/lmi-container-requirements-common.txt
@@ -1,6 +1,6 @@
-peft==0.13.2
+peft
 protobuf==3.20.3
-transformers==4.45.2
+transformers>=4.45.2
 hf-transfer
 zstandard
 datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
 sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq==0.2.5
-llmcompressor==0.3.1
-tokenizers==0.20.3
-pydantic==2.9.2
+tokenizers>=0.20.3
+pydantic>=2.9.2
 optimum==1.23.2
 torch==2.5.1
 torchvision==0.20.1
1 change: 1 addition & 0 deletions serving/docker/requirements-lmi.txt
@@ -1,4 +1,5 @@
 -r requirements-common.txt
+llmcompressor
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
3 changes: 2 additions & 1 deletion serving/docker/requirements-vllm.txt
@@ -1,2 +1,3 @@
 -r requirements-common.txt
-vllm==0.7.0
+llmcompressor
+vllm==0.7.1
8 changes: 1 addition & 7 deletions serving/docker/scripts/create_virtual_env.sh
@@ -7,12 +7,6 @@ requirements_file=$2
 # This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
 python -m venv --system-site-packages $venv_directory
 venv_pip="${venv_directory}/bin/pip"
-$venv_pip install -r $requirements_file
+$venv_pip install -r $requirements_file || exit 1
 $venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
-git clone https://github.com/neuralmagic/AutoFP8.git
-cd AutoFP8
-git reset --hard 4b2092c
-$venv_pip install .
-cd ..
-rm -rf AutoFP8
 $venv_pip cache purge

Contributor: Do we not need FP8 installation?

Contributor Author: not anymore! we're using llm compressor now #2701
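The `|| exit 1` added above makes a failed dependency install abort the image build instead of leaving a half-provisioned virtual environment. The same fail-fast idea in Python, for scripts that drive pip programmatically (the helper name is invented for this sketch):

# Sketch: fail fast when an install step returns a non-zero exit code.
import subprocess
import sys

def install_requirements(venv_pip: str, requirements_file: str) -> None:
    # check=True raises CalledProcessError on failure, mirroring the
    # shell script's `|| exit 1`.
    subprocess.run([venv_pip, "install", "-r", requirements_file], check=True)

if __name__ == "__main__":
    try:
        install_requirements(sys.argv[1], sys.argv[2])
    except subprocess.CalledProcessError as err:
        sys.exit(err.returncode)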
5 changes: 5 additions & 0 deletions tests/integration/llm/client.py
@@ -602,6 +602,11 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
     },
+    "mistral-7b": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
+    }
 }

 vllm_tool_model_spec = {
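Entries in this spec are keyed by model name and parameterize the client's request sweep. A hedged sketch of how such an entry is typically consumed (the payload shape and loop are illustrative, not taken from client.py):

# Sketch: iterating a model spec to parameterize test requests.
spec = {
    "batch_size": [1, 4],
    "seq_length": [256],
    "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
}

for batch_size in spec["batch_size"]:
    for seq_length in spec["seq_length"]:
        # One request template per (batch_size, seq_length) combination.
        payload = {"inputs": "Hello", "parameters": {"max_new_tokens": seq_length}}
        print(f"batch of {batch_size}: {payload}")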
1 change: 1 addition & 0 deletions tests/integration/tests.py
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
             prepare.build_vllm_model("mistral-7b")
             r.launch()
             client.run("vllm mistral-7b".split())
+            client.run("vllm_chat mistral-7b".split())

     def test_phi2(self):
         with Runner('lmi', 'phi-2') as r:
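The new vllm_chat run exercises the chat completions path enabled by the earlier changes. A hedged sketch of the kind of request it is expected to send; the endpoint URL and response handling are assumptions, not taken from the test client:

# Sketch: an OpenAI-style chat completions request to a local endpoint.
import json
import urllib.request

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Mistral 7B?"},
    ],
    "max_tokens": 256,
}
req = urllib.request.Request(
    "http://localhost:8080/invocations",  # assumed local serving endpoint
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))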