@@ -545,28 +545,14 @@ def test_llama3_8b(self):
545 545
546 546  @pytest.mark.vllm
547 547  @pytest.mark.gpu_4
548     - class TestVllm1:
    548 + class TestVllm:
549 549
550 550      def test_gemma_2b(self):
551 551          with Runner('lmi', 'gemma-2b') as r:
552 552              prepare.build_vllm_model("gemma-2b")
553 553              r.launch()
554 554              client.run("vllm gemma-2b".split())
555 555
556     -     def test_llama2_7b_chat(self):
557     -         with Runner('lmi', 'llama2-7b-chat') as r:
558     -             prepare.build_vllm_async_model("llama2-7b-chat")
559     -             r.launch()
560     -             client.run("vllm_chat llama2-7b-chat".split())
561     -
562     -     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
563     -                         reason="Unsupported CUDA capability")
564     -     def test_qwen2_7b_fp8(self):
565     -         with Runner('lmi', 'qwen2-7b-fp8') as r:
566     -             prepare.build_vllm_async_model("qwen2-7b-fp8")
567     -             r.launch()
568     -             client.run("vllm qwen2-7b-fp8".split())
569     -
570 556      def test_llama3_8b_chunked_prefill(self):
571 557          with Runner('lmi', 'llama3-8b-chunked-prefill') as r:
572 558              prepare.build_vllm_model("llama3-8b-chunked-prefill")
@@ -576,7 +562,7 @@ def test_llama3_8b_chunked_prefill(self):
576 562
577 563      def test_falcon_11b_chunked_prefill(self):
578 564          with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
579     -             prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
    565 +             prepare.build_vllm_model("falcon-11b-chunked-prefill")
580 566              r.launch()
581 567              client.run(
582 568                  "vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -587,6 +573,31 @@ def test_llama_68m_speculative_medusa(self):
587 573              r.launch()
588 574              client.run("vllm llama-68m-speculative-medusa".split())
589 575
    576 +     def test_vllm_performance(self):
    577 +         with Runner('lmi', 'handler-performance-vllm') as r:
    578 +             prepare.build_handler_performance_model("tiny-llama-vllm")
    579 +             r.launch("CUDA_VISIBLE_DEVICES=0")
    580 +             client.run("handler_performance vllm".split())
    581 +
    582 +
    583 + @pytest.mark.vllm
    584 + @pytest.mark.gpu_4
    585 + class TestVllmAsync:
    586 +
    587 +     def test_llama2_7b_chat(self):
    588 +         with Runner('lmi', 'llama2-7b-chat') as r:
    589 +             prepare.build_vllm_async_model("llama2-7b-chat")
    590 +             r.launch()
    591 +             client.run("vllm_chat llama2-7b-chat".split())
    592 +
    593 +     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
    594 +                         reason="Unsupported CUDA capability")
    595 +     def test_qwen2_7b_fp8(self):
    596 +         with Runner('lmi', 'qwen2-7b-fp8') as r:
    597 +             prepare.build_vllm_async_model("qwen2-7b-fp8")
    598 +             r.launch()
    599 +             client.run("vllm qwen2-7b-fp8".split())
    600 +
590 601      def test_llama_68m_speculative_eagle(self):
591 602          with Runner('lmi', 'llama-68m-speculative-eagle') as r:
592 603              prepare.build_vllm_async_model("llama-68m-speculative-eagle")
@@ -624,12 +635,6 @@ def test_tiny_llama_input_length_exceeded(self):
624 635              client.run(
625 636                  "vllm tinyllama-input-len-exceeded --in_tokens 10".split())
626 637
627     -     def test_vllm_performance(self):
628     -         with Runner('lmi', 'handler-performance-vllm') as r:
629     -             prepare.build_handler_performance_model("tiny-llama-vllm")
630     -             r.launch("CUDA_VISIBLE_DEVICES=0")
631     -             client.run("handler_performance vllm".split())
632     -
633 638
634 639  @pytest.mark.vllm
635 640  @pytest.mark.lora
0 commit comments