Skip to content

Commit cd2234a

Browse files
authored
[CI] Modify some CI test cases to run on L4 environment to reduce H100 resource usage. (#1543)
Signed-off-by: yenuo26 <410167048@qq.com>
Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com>
1 parent b0156d8 commit cd2234a

File tree

13 files changed

+103
-129
lines changed

13 files changed

+103
-129
lines changed

.buildkite/test-amd.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ steps:
8989
- export VLLM_LOGGING_LEVEL=DEBUG
9090
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
9191
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
92+
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
9293

9394
- label: "Omni Model Test Qwen3-Omni"
9495
timeout_in_minutes: 15
@@ -102,7 +103,7 @@ steps:
102103
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
103104
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
104105
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
105-
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
106+
106107

107108
- label: "Diffusion Image Edit Test"
108109
timeout_in_minutes: 15

.buildkite/test-merge.yml

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -155,41 +155,31 @@ steps:
155155
volumes:
156156
- "/fsx/hf_cache:/fsx/hf_cache"
157157

158-
- label: "Benchmark & Engine Test with H100"
159-
timeout_in_minutes: 15
158+
- label: "Benchmark & Engine Test"
160159
depends_on: upload-merge-pipeline
161160
commands:
162-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
163-
- pytest -s -v tests/benchmarks/test_serve_cli.py
164-
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
161+
- |
162+
timeout 15m bash -c '
163+
export VLLM_WORKER_MULTIPROC_METHOD=spawn
164+
set +e
165+
pytest -s -v tests/benchmarks/test_serve_cli.py
166+
EXIT1=$$?
167+
pytest -s -v tests/engine/test_async_omni_engine_abort.py
168+
EXIT2=$$?
169+
exit $$((EXIT1 | EXIT2))
170+
'
165171
agents:
166-
queue: "mithril-h100-pool"
172+
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
167173
plugins:
168-
- kubernetes:
169-
podSpec:
170-
containers:
171-
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
172-
resources:
173-
limits:
174-
nvidia.com/gpu: 2
175-
volumeMounts:
176-
- name: devshm
177-
mountPath: /dev/shm
178-
- name: hf-cache
179-
mountPath: /root/.cache/huggingface
180-
env:
181-
- name: HF_HOME
182-
value: /root/.cache/huggingface
183-
nodeSelector:
184-
node.kubernetes.io/instance-type: gpu-h100-sxm
185-
volumes:
186-
- name: devshm
187-
emptyDir:
188-
medium: Memory
189-
- name: hf-cache
190-
hostPath:
191-
path: /mnt/hf-cache
192-
type: DirectoryOrCreate
174+
- docker#v5.2.0:
175+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
176+
always-pull: true
177+
propagate-environment: true
178+
shm-size: "8gb"
179+
environment:
180+
- "HF_HOME=/fsx/hf_cache"
181+
volumes:
182+
- "/fsx/hf_cache:/fsx/hf_cache"
193183

194184
- label: "Omni Model Test"
195185
timeout_in_minutes: 15

.buildkite/test-nightly.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@ steps:
55
if: build.env("NIGHTLY") == "1"
66
commands:
77
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
8-
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
9-
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
8+
- |
9+
set +e
10+
pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
11+
EXIT1=$$?
12+
pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
13+
EXIT2=$$?
14+
exit $$((EXIT1 | EXIT2))
1015
agents:
1116
queue: "mithril-h100-pool"
1217
plugins:

.buildkite/test-ready.yml

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ steps:
3636
commands:
3737
- |
3838
timeout 20m bash -c '
39-
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py &&
39+
set +e
40+
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
41+
EXIT1=$$?
4042
pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
43+
EXIT2=$$?
44+
exit $$((EXIT1 | EXIT2))
4145
'
4246
agents:
4347
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
@@ -117,43 +121,33 @@ steps:
117121
volumes:
118122
- "/fsx/hf_cache:/fsx/hf_cache"
119123

120-
# - label: "Benchmark & Engine Test with H100"
121-
# depends_on: upload-ready-pipeline
122-
# commands:
123-
# - |
124-
# timeout 15m bash -c '
125-
# export VLLM_WORKER_MULTIPROC_METHOD=spawn
126-
# pytest -s -v tests/benchmarks/test_serve_cli.py
127-
# pytest -s -v tests/engine/test_async_omni_engine_abort.py
128-
# '
129-
# agents:
130-
# queue: "mithril-h100-pool"
131-
# plugins:
132-
# - kubernetes:
133-
# podSpec:
134-
# containers:
135-
# - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
136-
# resources:
137-
# limits:
138-
# nvidia.com/gpu: 2
139-
# volumeMounts:
140-
# - name: devshm
141-
# mountPath: /dev/shm
142-
# - name: hf-cache
143-
# mountPath: /root/.cache/huggingface
144-
# env:
145-
# - name: HF_HOME
146-
# value: /root/.cache/huggingface
147-
# nodeSelector:
148-
# node.kubernetes.io/instance-type: gpu-h100-sxm
149-
# volumes:
150-
# - name: devshm
151-
# emptyDir:
152-
# medium: Memory
153-
# - name: hf-cache
154-
# hostPath:
155-
# path: /mnt/hf-cache
156-
# type: DirectoryOrCreate
124+
125+
- label: "Benchmark & Engine Test"
126+
depends_on: upload-ready-pipeline
127+
commands:
128+
- |
129+
timeout 15m bash -c '
130+
export VLLM_WORKER_MULTIPROC_METHOD=spawn
131+
set +e
132+
pytest -s -v tests/benchmarks/test_serve_cli.py
133+
EXIT1=$$?
134+
pytest -s -v tests/engine/test_async_omni_engine_abort.py
135+
EXIT2=$$?
136+
exit $$((EXIT1 | EXIT2))
137+
'
138+
agents:
139+
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
140+
plugins:
141+
- docker#v5.2.0:
142+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
143+
always-pull: true
144+
propagate-environment: true
145+
shm-size: "8gb"
146+
environment:
147+
- "HF_HOME=/fsx/hf_cache"
148+
volumes:
149+
- "/fsx/hf_cache:/fsx/hf_cache"
150+
157151

158152
- label: "Omni Model Test"
159153
depends_on: upload-ready-pipeline

tests/benchmarks/test_serve_cli.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,36 +3,18 @@
33

44
import pytest
55

6-
from tests.conftest import OmniServer
76
from tests.utils import hardware_test
87

9-
models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
10-
stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
8+
models = ["Qwen/Qwen2.5-Omni-7B"]
9+
stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")]
1110

1211
# Create parameter combinations for model and stage config
1312
test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
1413

1514

16-
@pytest.fixture(scope="module")
17-
def omni_server(request):
18-
"""Start vLLM-Omni server as a subprocess with actual model weights.
19-
Uses session scope so the server starts only once for the entire test session.
20-
Multi-stage initialization can take 10-20+ minutes.
21-
"""
22-
model, stage_config_path = request.param
23-
24-
print(f"Starting OmniServer with model: {model}")
25-
print("This may take 10-20+ minutes for initialization...")
26-
27-
with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
28-
print("OmniServer started successfully")
29-
yield server
30-
print("OmniServer stopped")
31-
32-
3315
@pytest.mark.core_model
3416
@pytest.mark.benchmark
35-
@hardware_test(res={"cuda": "H100"}, num_cards=2)
17+
@hardware_test(res={"cuda": "L4"}, num_cards=3)
3618
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
3719
def test_bench_serve_chat(omni_server):
3820
command = [

tests/e2e/online_serving/test_qwen3_omni_expansion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def get_system_prompt():
7171

7272
def get_prompt(prompt_type="text_only"):
7373
prompts = {
74-
"text_only": "What is the capital of China?",
74+
"text_only": "What is the capital of China? Answer in 20 words.",
7575
"mix": "What is recited in the audio? What is in this image? What is in this video?",
7676
"text_video": "What is in this video? ",
7777
"text_image": "What is in this image? ",

tests/e2e/stage_configs/qwen2_5_omni_ci.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ stage_args:
2222
trust_remote_code: true
2323
engine_output_type: latent
2424
enable_prefix_caching: false
25+
mm_processor_cache_gb: 0
2526
is_comprehension: true
2627
final_output: true
2728
final_output_type: text
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
stage_args:
2+
- stage_id: 0
3+
runtime:
4+
process: true # Run this stage in a separate process
5+
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
6+
max_batch_size: 1
7+
engine_args:
8+
model_stage: thinker
9+
model_arch: Qwen2_5OmniForConditionalGeneration
10+
worker_type: ar
11+
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
12+
max_model_len: 16384
13+
max_num_batched_tokens: 16384
14+
max_num_seqs: 1
15+
gpu_memory_utilization: 0.9
16+
skip_mm_profiling: true
17+
enforce_eager: true # Now we only support eager mode
18+
trust_remote_code: true
19+
engine_output_type: latent
20+
enable_prefix_caching: false
21+
is_comprehension: true
22+
final_output: true
23+
final_output_type: text
24+
default_sampling_params:
25+
temperature: 0.0
26+
top_p: 1.0
27+
top_k: -1
28+
max_tokens: 128
29+
seed: 42
30+
detokenize: True
31+
repetition_penalty: 1.1

tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml

Lines changed: 0 additions & 31 deletions
This file was deleted.

tests/engine/test_async_omni_engine_abort.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515

1616
SEED = 42
1717

18-
stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml")
19-
model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
18+
stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml")
19+
model = "Qwen/Qwen2.5-Omni-7B"
2020

2121

2222
async def generate(
@@ -60,7 +60,7 @@ async def generate(
6060

6161
@pytest.mark.core_model
6262
@pytest.mark.omni
63-
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
63+
@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
6464
@pytest.mark.asyncio
6565
async def test_abort():
6666
with ExitStack() as after:

0 commit comments

Comments (0)