Skip to content

Commit 9b50e52

Browse files
committed
[ci] Separate vllm async tests and remove async model options
1 parent 89403fe commit 9b50e52

File tree

3 files changed

+13
-17
lines changed

3 files changed

+13
-17
lines changed

.github/workflows/integration.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,10 @@ jobs:
154154
# - test: TestTrtLlmHandler2
155155
# instance: g6
156156
# failure-prefix: trtllm
157-
- test: TestVllm1
157+
- test: TestVllm
158+
instance: g6
159+
failure-prefix: lmi
160+
- test: TestVllmAsync
158161
instance: g6
159162
failure-prefix: lmi
160163
- test: TestVllmCustomHandlers

tests/integration/llm/prepare.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,12 +1507,9 @@
15071507
handler_performance_model_list = {
15081508
"tiny-llama-vllm": {
15091509
"engine": "Python",
1510-
"option.rolling_batch": "disable",
1511-
"option.async_mode": True,
15121510
"option.model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
15131511
"option.gpu_memory_utilization": "0.9",
15141512
"option.max_rolling_batch_size": 512,
1515-
"option.entryPoint": "djl_python.lmi_vllm.vllm_async_service",
15161513
},
15171514
"tiny-llama-trtllm": {
15181515
"engine": "Python",
@@ -1717,9 +1714,6 @@ def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
17171714
)
17181715
options = vllm_model_list[model]
17191716
options["engine"] = "Python"
1720-
options["option.rolling_batch"] = "disable"
1721-
options["option.async_mode"] = "true"
1722-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
17231717
write_model_artifacts(options)
17241718

17251719
# Copy custom handler from examples
@@ -1736,9 +1730,6 @@ def build_vllm_async_model_custom_formatters(model, error_type=None):
17361730
)
17371731
options = vllm_model_list[model]
17381732
options["engine"] = "Python"
1739-
options["option.rolling_batch"] = "disable"
1740-
options["option.async_mode"] = "true"
1741-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
17421733
write_model_artifacts(options)
17431734

17441735
# Create custom formatter files based on error_type
@@ -1883,9 +1874,6 @@ def build_stateful_model(model):
18831874
)
18841875
options = stateful_model_list[model]
18851876
options["engine"] = "Python"
1886-
options["option.rolling_batch"] = "disable"
1887-
options["option.async_mode"] = "true"
1888-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
18891877
options["option.enable_stateful_sessions"] = "true"
18901878
options["option.sessions_path"] = "/tmp/djl_sessions"
18911879
write_model_artifacts(options)

tests/integration/tests.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ def test_llama3_8b(self):
545545

546546
@pytest.mark.vllm
547547
@pytest.mark.gpu_4
548-
class TestVllm1:
548+
class TestVllm:
549549

550550
def test_gemma_2b(self):
551551
with Runner('lmi', 'gemma-2b') as r:
@@ -555,15 +555,15 @@ def test_gemma_2b(self):
555555

556556
def test_llama2_7b_chat(self):
557557
with Runner('lmi', 'llama2-7b-chat') as r:
558-
prepare.build_vllm_async_model("llama2-7b-chat")
558+
prepare.build_vllm_model("llama2-7b-chat")
559559
r.launch()
560560
client.run("vllm_chat llama2-7b-chat".split())
561561

562562
@pytest.mark.skipif(not is_applicable_cuda_capability(89),
563563
reason="Unsupported CUDA capability")
564564
def test_qwen2_7b_fp8(self):
565565
with Runner('lmi', 'qwen2-7b-fp8') as r:
566-
prepare.build_vllm_async_model("qwen2-7b-fp8")
566+
prepare.build_vllm_model("qwen2-7b-fp8")
567567
r.launch()
568568
client.run("vllm qwen2-7b-fp8".split())
569569

@@ -576,7 +576,7 @@ def test_llama3_8b_chunked_prefill(self):
576576

577577
def test_falcon_11b_chunked_prefill(self):
578578
with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
579-
prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
579+
prepare.build_vllm_model("falcon-11b-chunked-prefill")
580580
r.launch()
581581
client.run(
582582
"vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -587,6 +587,11 @@ def test_llama_68m_speculative_medusa(self):
587587
r.launch()
588588
client.run("vllm llama-68m-speculative-medusa".split())
589589

590+
591+
@pytest.mark.vllm
592+
@pytest.mark.gpu_4
593+
class TestVllmAsync:
594+
590595
def test_llama_68m_speculative_eagle(self):
591596
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
592597
prepare.build_vllm_async_model("llama-68m-speculative-eagle")

0 commit comments

Comments (0)