Skip to content

Commit e68e3b5

Browse files
committed
[ci] Separate vllm sync and async integration tests
1 parent 89403fe commit e68e3b5

File tree

2 files changed

+31
-23
lines changed

2 files changed

+31
-23
lines changed

.github/workflows/integration.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,10 @@ jobs:
154154
# - test: TestTrtLlmHandler2
155155
# instance: g6
156156
# failure-prefix: trtllm
157-
- test: TestVllm1
157+
- test: TestVllm
158+
instance: g6
159+
failure-prefix: lmi
160+
- test: TestVllmAsync
158161
instance: g6
159162
failure-prefix: lmi
160163
- test: TestVllmCustomHandlers

tests/integration/tests.py

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -545,28 +545,14 @@ def test_llama3_8b(self):
545545

546546
@pytest.mark.vllm
547547
@pytest.mark.gpu_4
548-
class TestVllm1:
548+
class TestVllm:
549549

550550
def test_gemma_2b(self):
551551
with Runner('lmi', 'gemma-2b') as r:
552552
prepare.build_vllm_model("gemma-2b")
553553
r.launch()
554554
client.run("vllm gemma-2b".split())
555555

556-
def test_llama2_7b_chat(self):
557-
with Runner('lmi', 'llama2-7b-chat') as r:
558-
prepare.build_vllm_async_model("llama2-7b-chat")
559-
r.launch()
560-
client.run("vllm_chat llama2-7b-chat".split())
561-
562-
@pytest.mark.skipif(not is_applicable_cuda_capability(89),
563-
reason="Unsupported CUDA capability")
564-
def test_qwen2_7b_fp8(self):
565-
with Runner('lmi', 'qwen2-7b-fp8') as r:
566-
prepare.build_vllm_async_model("qwen2-7b-fp8")
567-
r.launch()
568-
client.run("vllm qwen2-7b-fp8".split())
569-
570556
def test_llama3_8b_chunked_prefill(self):
571557
with Runner('lmi', 'llama3-8b-chunked-prefill') as r:
572558
prepare.build_vllm_model("llama3-8b-chunked-prefill")
@@ -576,7 +562,7 @@ def test_llama3_8b_chunked_prefill(self):
576562

577563
def test_falcon_11b_chunked_prefill(self):
578564
with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
579-
prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
565+
prepare.build_vllm_model("falcon-11b-chunked-prefill")
580566
r.launch()
581567
client.run(
582568
"vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -587,6 +573,31 @@ def test_llama_68m_speculative_medusa(self):
587573
r.launch()
588574
client.run("vllm llama-68m-speculative-medusa".split())
589575

576+
def test_vllm_performance(self):
577+
with Runner('lmi', 'handler-performance-vllm') as r:
578+
prepare.build_handler_performance_model("tiny-llama-vllm")
579+
r.launch("CUDA_VISIBLE_DEVICES=0")
580+
client.run("handler_performance vllm".split())
581+
582+
583+
@pytest.mark.vllm
584+
@pytest.mark.gpu_4
585+
class TestVllmAsync:
586+
587+
def test_llama2_7b_chat(self):
588+
with Runner('lmi', 'llama2-7b-chat') as r:
589+
prepare.build_vllm_async_model("llama2-7b-chat")
590+
r.launch()
591+
client.run("vllm_chat llama2-7b-chat".split())
592+
593+
@pytest.mark.skipif(not is_applicable_cuda_capability(89),
594+
reason="Unsupported CUDA capability")
595+
def test_qwen2_7b_fp8(self):
596+
with Runner('lmi', 'qwen2-7b-fp8') as r:
597+
prepare.build_vllm_async_model("qwen2-7b-fp8")
598+
r.launch()
599+
client.run("vllm qwen2-7b-fp8".split())
600+
590601
def test_llama_68m_speculative_eagle(self):
591602
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
592603
prepare.build_vllm_async_model("llama-68m-speculative-eagle")
@@ -624,12 +635,6 @@ def test_tiny_llama_input_length_exceeded(self):
624635
client.run(
625636
"vllm tinyllama-input-len-exceeded --in_tokens 10".split())
626637

627-
def test_vllm_performance(self):
628-
with Runner('lmi', 'handler-performance-vllm') as r:
629-
prepare.build_handler_performance_model("tiny-llama-vllm")
630-
r.launch("CUDA_VISIBLE_DEVICES=0")
631-
client.run("handler_performance vllm".split())
632-
633638

634639
@pytest.mark.vllm
635640
@pytest.mark.lora

0 commit comments

Comments (0)