Skip to content

Commit d08744c

Browse files
ksuma2109 and Suma Kasa authored
Upgrade vllm to 0.12.0 and fix Multimodal integration tests (#2978)
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
1 parent 16da769 commit d08744c

File tree

6 files changed

+18
-11
lines changed

6 files changed

+18
-11
lines changed

.github/workflows/integration.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,8 @@ jobs:
224224
- test: TestVllmAsyncLoraWithCustomCode_g6
225225
instance: g6
226226
failure-prefix: lmi
227-
- test: TestMultiModalVllm_g6
228-
instance: g6
227+
- test: TestMultiModalVllm_p4d
228+
instance: p4d
229229
failure-prefix: lmi
230230
# - test: TestTextEmbedding_g6
231231
# instance: g6

engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from vllm import TokensPrompt
1717
from vllm.entrypoints.openai.serving_engine import RequestPrompt, TextTokensPrompt
1818
from vllm.entrypoints.openai.tool_parsers import ToolParser
19-
from vllm.transformers_utils.tokenizers.mistral import maybe_serialize_tool_calls
19+
from vllm.tokenizers.mistral import maybe_serialize_tool_calls
2020
from vllm.transformers_utils.tokenizer import AnyTokenizer
2121
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
2222
from vllm.entrypoints.chat_utils import (

engines/python/setup/djl_python/lmi_vllm/request_response_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,11 @@ def vllm_non_stream_output_formatter(
166166
**_,
167167
) -> Output:
168168
if isinstance(response, ErrorResponse):
169-
return create_non_stream_output("",
170-
error=response.message,
171-
code=response.code)
169+
error_msg = getattr(response, 'message', None) or getattr(
170+
response, 'detail', str(response))
171+
error_code = getattr(response, 'code', None) or getattr(
172+
response, 'type', 500)
173+
return create_non_stream_output("", error=error_msg, code=error_code)
172174
response_data = response.model_dump_json()
173175
return create_non_stream_output(response_data)
174176

serving/docker/lmi-container-requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ uvloop
3232
ninja
3333
peft
3434
llmcompressor
35-
https://djl-ai.s3.us-east-1.amazonaws.com/publish/vllm/vllm_extensions_package-1.0.0-py3-none-any.whl
35+
vllm==0.12.0
3636
xgrammar
37-
flashinfer-python==0.5.2
37+
flashinfer-python==0.5.3
3838
lmcache

tests/integration/llm/prepare.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,9 @@
417417
"llava_v1.6-mistral": {
418418
"option.model_id": "s3://djl-llm/llava-v1.6-mistral-7b-hf/",
419419
"option.limit_mm_per_prompt": '{"image": 4}',
420+
"option.gpu_memory_utilization": "0.7",
421+
"option.enforce_eager": True,
422+
"option.tensor_parallel_degree": 1,
420423
},
421424
"paligemma-3b-mix-448": {
422425
"option.model_id": "s3://djl-llm/paligemma-3b-mix-448/",

tests/integration/tests.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -552,17 +552,19 @@ def test_mistral_7b_fp8(self):
552552
"correctness trtllm-mistral-7b-instruct-v0.3-fp8".split())
553553

554554

555-
class TestMultiModalVllm_g6:
555+
@pytest.mark.vllm
556+
@pytest.mark.gpu_8
557+
class TestMultiModalVllm_p4d:
556558

557559
def test_llava_next(self):
558560
with Runner("lmi", "llava_v1.6-mistral") as r:
559-
prepare.build_vllm_model("llava_v1.6-mistral")
561+
prepare.build_vllm_async_model("llava_v1.6-mistral")
560562
r.launch()
561563
client.run("multimodal llava_v1.6-mistral".split())
562564

563565
def test_phi3_v(self):
564566
with Runner("lmi", "phi-3-vision-128k-instruct") as r:
565-
prepare.build_vllm_model("phi-3-vision-128k-instruct")
567+
prepare.build_vllm_async_model("phi-3-vision-128k-instruct")
566568
r.launch()
567569
client.run("multimodal phi-3-vision-128k-instruct".split())
568570

0 commit comments

Comments (0)