[ci] Separate vllm async tests and remove async model options

xyang16 · xyang16 · commit 0944ce3cfefa · 2025-10-29T23:33:15.000-07:00
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -157,6 +157,9 @@ jobs:
           - test: TestVllm1
             instance: g6
             failure-prefix: lmi
+          - test: TestVllm2
+            instance: g6
+            failure-prefix: lmi
           - test: TestVllmCustomHandlers
             instance: g6
             failure-prefix: lmi
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -1507,12 +1507,9 @@
 handler_performance_model_list = {
     "tiny-llama-vllm": {
         "engine": "Python",
-        "option.rolling_batch": "disable",
-        "option.async_mode": True,
         "option.model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
         "option.gpu_memory_utilization": "0.9",
         "option.max_rolling_batch_size": 512,
-        "option.entryPoint": "djl_python.lmi_vllm.vllm_async_service",
     },
     "tiny-llama-trtllm": {
         "engine": "Python",
@@ -1717,9 +1714,6 @@ def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
         )
     options = vllm_model_list[model]
     options["engine"] = "Python"
-    options["option.rolling_batch"] = "disable"
-    options["option.async_mode"] = "true"
-    options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
     write_model_artifacts(options)
 
     # Copy custom handler from examples
@@ -1736,9 +1730,6 @@ def build_vllm_async_model_custom_formatters(model, error_type=None):
         )
     options = vllm_model_list[model]
     options["engine"] = "Python"
-    options["option.rolling_batch"] = "disable"
-    options["option.async_mode"] = "true"
-    options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
     write_model_artifacts(options)
 
     # Create custom formatter files based on error_type
@@ -1883,9 +1874,6 @@ def build_stateful_model(model):
         )
     options = stateful_model_list[model]
     options["engine"] = "Python"
-    options["option.rolling_batch"] = "disable"
-    options["option.async_mode"] = "true"
-    options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
     options["option.enable_stateful_sessions"] = "true"
     options["option.sessions_path"] = "/tmp/djl_sessions"
     write_model_artifacts(options)
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
@@ -555,7 +555,7 @@ def test_gemma_2b(self):
 
     def test_llama2_7b_chat(self):
         with Runner('lmi', 'llama2-7b-chat') as r:
-            prepare.build_vllm_async_model("llama2-7b-chat")
+            prepare.build_vllm_model("llama2-7b-chat")
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())
 
@@ -587,6 +587,11 @@ def test_llama_68m_speculative_medusa(self):
             r.launch()
             client.run("vllm llama-68m-speculative-medusa".split())
 
+
+@pytest.mark.vllm
+@pytest.mark.gpu_4
+class TestVllm2:
+
     def test_llama_68m_speculative_eagle(self):
         with Runner('lmi', 'llama-68m-speculative-eagle') as r:
             prepare.build_vllm_async_model("llama-68m-speculative-eagle")