Skip to content
3 changes: 2 additions & 1 deletion .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ steps:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
- pytest -s -v tests/engine/test_async_omni_engine_abort.py

- label: "Omni Model Test Qwen3-Omni"
timeout_in_minutes: 15
Expand All @@ -102,7 +103,7 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
- pytest -s -v tests/engine/test_async_omni_engine_abort.py


- label: "Diffusion Image Edit Test"
timeout_in_minutes: 15
Expand Down
52 changes: 21 additions & 31 deletions .buildkite/test-merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,41 +155,31 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "Benchmark & Engine Test with H100"
timeout_in_minutes: 15
- label: "Benchmark & Engine Test"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old config had timeout_in_minutes: 15 at the Buildkite level. The inner timeout 15m only kills the bash process — if the Docker pull or container startup hangs, Buildkite will wait forever. Add timeout_in_minutes back.

depends_on: upload-merge-pipeline
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/benchmarks/test_serve_cli.py
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
- |
timeout 15m bash -c '
export VLLM_WORKER_MULTIPROC_METHOD=spawn
set +e
pytest -s -v tests/benchmarks/test_serve_cli.py
EXIT1=$$?
pytest -s -v tests/engine/test_async_omni_engine_abort.py
EXIT2=$$?
exit $$((EXIT1 | EXIT2))
'
agents:
queue: "mithril-h100-pool"
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- kubernetes:
podSpec:
containers:
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 2
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "Omni Model Test"
timeout_in_minutes: 15
Expand Down
9 changes: 7 additions & 2 deletions .buildkite/test-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ steps:
if: build.env("NIGHTLY") == "1"
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
- |
set +e
pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
EXIT1=$$?
pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
EXIT2=$$?
exit $$((EXIT1 | EXIT2))
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down
70 changes: 32 additions & 38 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,12 @@ steps:
commands:
- |
timeout 20m bash -c '
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py &&
set +e
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
EXIT1=$$?
pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
EXIT2=$$?
exit $$((EXIT1 | EXIT2))
'
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
Expand Down Expand Up @@ -117,43 +121,33 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

# - label: "Benchmark & Engine Test with H100"
# depends_on: upload-ready-pipeline
# commands:
# - |
# timeout 15m bash -c '
# export VLLM_WORKER_MULTIPROC_METHOD=spawn
# pytest -s -v tests/benchmarks/test_serve_cli.py
# pytest -s -v tests/engine/test_async_omni_engine_abort.py
# '
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 2
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate

# Runs the serve-CLI benchmark test and the async engine abort test inside the
# CI test image, on the 4x L4 agent queue.
- label: "Benchmark & Engine Test"
  # Step-level cap enforced by Buildkite. The `timeout 15m` in the command only
  # bounds the bash process inside the container; without this attribute a hung
  # image pull or container start is never cancelled. Kept a little above the
  # inner 15m so the inner timeout fires first when the tests themselves hang.
  timeout_in_minutes: 20
  depends_on: upload-ready-pipeline
  # `set +e` lets both pytest runs execute even if the first fails; their exit
  # codes are OR-ed so the step fails when either run fails.
  # `$$` is the pipeline-upload escape for a literal `$` (so bash sees `$?`).
  commands:
    - |
      timeout 15m bash -c '
        export VLLM_WORKER_MULTIPROC_METHOD=spawn
        set +e
        pytest -s -v tests/benchmarks/test_serve_cli.py
        EXIT1=$$?
        pytest -s -v tests/engine/test_async_omni_engine_abort.py
        EXIT2=$$?
        exit $$((EXIT1 | EXIT2))
      '
  agents:
    queue: "gpu_4_queue"  # g6.12xlarge instance on AWS, has 4 L4 GPU
  plugins:
    - docker#v5.2.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        always-pull: true
        propagate-environment: true
        shm-size: "8gb"
        environment:
          - "HF_HOME=/fsx/hf_cache"
        volumes:
          - "/fsx/hf_cache:/fsx/hf_cache"


- label: "Omni Model Test"
depends_on: upload-ready-pipeline
Expand Down
24 changes: 3 additions & 21 deletions tests/benchmarks/test_serve_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,18 @@

import pytest

from tests.conftest import OmniServer
from tests.utils import hardware_test

models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
models = ["Qwen/Qwen2.5-Omni-7B"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switching from Qwen3-30B to Qwen2.5-7B means benchmark numbers are no longer comparable across runs. If this test is meant to track perf regressions over time, consider keeping a Qwen3 benchmark on H100 (even if less frequent) alongside this L4 one.

stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")]

# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models for stage_config in stage_configs]


@pytest.fixture(scope="module")
def omni_server(request):
"""Start an OmniServer for the parametrized (model, stage config) pair and yield it.
Module-scoped, per the fixture decorator: the server is started once per
parameter set for this module, not once per test and not once per session.
Multi-stage initialization can take 10-20+ minutes.
"""
# request.param is one (model, stage_config_path) tuple from the indirect parametrization.
model, stage_config_path = request.param

print(f"Starting OmniServer with model: {model}")
print("This may take 10-20+ minutes for initialization...")

# Extra server args: the stage config path and a stage init timeout of "120"
# (units are not visible here -- TODO confirm against OmniServer / the CLI).
with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
print("OmniServer started successfully")
yield server
print("OmniServer stopped")


@pytest.mark.core_model
@pytest.mark.benchmark
@hardware_test(res={"cuda": "H100"}, num_cards=2)
@hardware_test(res={"cuda": "L4"}, num_cards=3)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_bench_serve_chat(omni_server):
command = [
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/online_serving/test_qwen3_omni_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_system_prompt():

def get_prompt(prompt_type="text_only"):
prompts = {
"text_only": "What is the capital of China?",
"text_only": "What is the capital of China? Answer in 20 words.",
"mix": "What is recited in the audio? What is in this image? What is in this video?",
"text_video": "What is in this video? ",
"text_image": "What is in this image? ",
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/stage_configs/qwen2_5_omni_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ stage_args:
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
mm_processor_cache_gb: 0
is_comprehension: true
final_output: true
final_output_type: text
Expand Down
31 changes: 31 additions & 0 deletions tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI stage config with a single stage (stage_id 0): the Qwen2.5-Omni thinker,
# pinned to device "0", whose text output is the final output.
# NOTE(review): nesting below was reconstructed from a whitespace-stripped
# listing; confirm it matches the sibling stage configs before relying on it.
stage_args:
  - stage_id: 0
    runtime:
      process: true  # Run this stage in a separate process
      # Visible devices for this stage
      # (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
      devices: "0"
      max_batch_size: 1
    engine_args:
      model_stage: thinker
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_type: ar
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
      max_model_len: 16384
      max_num_batched_tokens: 16384
      max_num_seqs: 1
      gpu_memory_utilization: 0.9
      skip_mm_profiling: true
      enforce_eager: true  # Now we only support eager mode
      trust_remote_code: true
      engine_output_type: latent
      enable_prefix_caching: false
    is_comprehension: true
    final_output: true
    final_output_type: text
    # temperature 0.0 with a fixed seed -- presumably to keep CI output
    # reproducible; max_tokens caps each generation at 128 tokens.
    default_sampling_params:
      temperature: 0.0
      top_p: 1.0
      top_k: -1
      max_tokens: 128
      seed: 42
      detokenize: true
      repetition_penalty: 1.1
31 changes: 0 additions & 31 deletions tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml

This file was deleted.

6 changes: 3 additions & 3 deletions tests/engine/test_async_omni_engine_abort.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

SEED = 42

stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml")
model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml")
model = "Qwen/Qwen2.5-Omni-7B"


async def generate(
Expand Down Expand Up @@ -60,7 +60,7 @@ async def generate(

@pytest.mark.core_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
@pytest.mark.asyncio
async def test_abort():
with ExitStack() as after:
Expand Down
1 change: 0 additions & 1 deletion tests/examples/online_serving/test_qwen2_5_omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,6 @@ def test_modality_control_003(omni_server) -> None:
# TODO: Verify the E2E latency after confirmation baseline.


@pytest.mark.skip(reason="There is a known issue with stream error.")
@pytest.mark.advanced_model
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which fix resolved the stream error? Worth adding a comment or linking the PR in the commit message so this does not get re-skipped later.

@pytest.mark.omni
@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
Expand Down
1 change: 1 addition & 0 deletions vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ stage_args:
engine_output_type: latent
enable_prefix_caching: false
max_num_batched_tokens: 32768
mm_processor_cache_gb: 0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see #1534 for the reason for this change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw #1534; it makes sense for the CI config. But this same change is also added to the production stage configs (qwen2_5_omni.yaml and qwen2_5_omni_multiconnector.yaml), which disables the mm processor cache for all users, not just CI. Was that intentional? If it is only needed to work around an L4 memory constraint, keep it in the CI configs only.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We treat accuracy as the higher priority.

is_comprehension: true
final_output: true
final_output_type: text
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ stage_args:
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
mm_processor_cache_gb: 0
is_comprehension: true
final_output: true
final_output_type: text
Expand Down