Skip to content

Commit 9b50e52

Browse files
committed
[ci] Separate vllm async tests and remove async model options
1 parent 89403fe commit 9b50e52

File tree

3 files changed

+13
-17
lines changed

3 files changed

+13
-17
lines changed

.github/workflows/integration.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,10 @@ jobs:
154154
# - test: TestTrtLlmHandler2
155155
# instance: g6
156156
# failure-prefix: trtllm
157-
- test: TestVllm1
157+
- test: TestVllm
158+
instance: g6
159+
failure-prefix: lmi
160+
- test: TestVllmAsync
158161
instance: g6
159162
failure-prefix: lmi
160163
- test: TestVllmCustomHandlers

tests/integration/llm/prepare.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,12 +1507,9 @@
15071507
handler_performance_model_list = {
15081508
"tiny-llama-vllm": {
15091509
"engine": "Python",
1510-
"option.rolling_batch": "disable",
1511-
"option.async_mode": True,
15121510
"option.model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
15131511
"option.gpu_memory_utilization": "0.9",
15141512
"option.max_rolling_batch_size": 512,
1515-
"option.entryPoint": "djl_python.lmi_vllm.vllm_async_service",
15161513
},
15171514
"tiny-llama-trtllm": {
15181515
"engine": "Python",
@@ -1717,9 +1714,6 @@ def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
17171714
)
17181715
options = vllm_model_list[model]
17191716
options["engine"] = "Python"
1720-
options["option.rolling_batch"] = "disable"
1721-
options["option.async_mode"] = "true"
1722-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
17231717
write_model_artifacts(options)
17241718

17251719
# Copy custom handler from examples
@@ -1736,9 +1730,6 @@ def build_vllm_async_model_custom_formatters(model, error_type=None):
17361730
)
17371731
options = vllm_model_list[model]
17381732
options["engine"] = "Python"
1739-
options["option.rolling_batch"] = "disable"
1740-
options["option.async_mode"] = "true"
1741-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
17421733
write_model_artifacts(options)
17431734

17441735
# Create custom formatter files based on error_type
@@ -1883,9 +1874,6 @@ def build_stateful_model(model):
18831874
)
18841875
options = stateful_model_list[model]
18851876
options["engine"] = "Python"
1886-
options["option.rolling_batch"] = "disable"
1887-
options["option.async_mode"] = "true"
1888-
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
18891877
options["option.enable_stateful_sessions"] = "true"
18901878
options["option.sessions_path"] = "/tmp/djl_sessions"
18911879
write_model_artifacts(options)

tests/integration/tests.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ def test_llama3_8b(self):
545545

546546
@pytest.mark.vllm
547547
@pytest.mark.gpu_4
548-
class TestVllm1:
548+
class TestVllm:
549549

550550
def test_gemma_2b(self):
551551
with Runner('lmi', 'gemma-2b') as r:
@@ -555,15 +555,15 @@ def test_gemma_2b(self):
555555

556556
def test_llama2_7b_chat(self):
557557
with Runner('lmi', 'llama2-7b-chat') as r:
558-
prepare.build_vllm_async_model("llama2-7b-chat")
558+
prepare.build_vllm_model("llama2-7b-chat")
559559
r.launch()
560560
client.run("vllm_chat llama2-7b-chat".split())
561561

562562
@pytest.mark.skipif(not is_applicable_cuda_capability(89),
563563
reason="Unsupported CUDA capability")
564564
def test_qwen2_7b_fp8(self):
565565
with Runner('lmi', 'qwen2-7b-fp8') as r:
566-
prepare.build_vllm_async_model("qwen2-7b-fp8")
566+
prepare.build_vllm_model("qwen2-7b-fp8")
567567
r.launch()
568568
client.run("vllm qwen2-7b-fp8".split())
569569

@@ -576,7 +576,7 @@ def test_llama3_8b_chunked_prefill(self):
576576

577577
def test_falcon_11b_chunked_prefill(self):
578578
with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
579-
prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
579+
prepare.build_vllm_model("falcon-11b-chunked-prefill")
580580
r.launch()
581581
client.run(
582582
"vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -587,6 +587,11 @@ def test_llama_68m_speculative_medusa(self):
587587
r.launch()
588588
client.run("vllm llama-68m-speculative-medusa".split())
589589

590+
591+
@pytest.mark.vllm
592+
@pytest.mark.gpu_4
593+
class TestVllmAsync:
594+
590595
def test_llama_68m_speculative_eagle(self):
591596
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
592597
prepare.build_vllm_async_model("llama-68m-speculative-eagle")

0 commit comments

Comments (0)