From e031a23cb156a649da81ca9c6cb41d0ede70898a Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Fri, 27 Feb 2026 16:26:35 +0800 Subject: [PATCH 1/6] [Update] Modify nightly test commands and update model configurations - Updated the nightly test script to handle multiple pytest commands and capture exit statuses. - Changed model from "Qwen/Qwen3-Omni-30B-A3B-Instruct" to "Qwen/Qwen2.5-Omni-7B" in benchmark tests. - Updated stage configuration file for qwen2.5-omni. - Adjusted prompt in the online serving test to specify a word limit for the answer. Signed-off-by: yenuo26 <410167048@qq.com> --- .buildkite/test-nightly.yml | 9 ++++-- tests/benchmarks/test_serve_cli.py | 4 +-- .../test_qwen3_omni_expansion.py | 2 +- .../qwen2_5_omni_thinker_ci.yaml | 31 +++++++++++++++++++ tests/engine/test_async_omni_engine_abort.py | 4 +-- 5 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index b70e73c489..30455468c2 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -5,8 +5,13 @@ steps: if: build.env("NIGHTLY") == "1" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" - - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + - | + set +e + pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" + EXIT1=$? + pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + EXIT2=$? + exit $((EXIT1 | EXIT2)) agents: queue: "mithril-h100-pool" plugins: diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 40244eb572..77c7b49338 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -6,8 +6,8 @@ from tests.conftest import OmniServer from tests.utils import hardware_test -models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")] +models = ["Qwen/Qwen2.5-Omni-7B"] +stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")] # Create parameter combinations for model and stage config test_params = [(model, stage_config) for model in models for stage_config in stage_configs] diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index c4731ffc7d..e2f77af736 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -71,7 +71,7 @@ def get_system_prompt(): def get_prompt(prompt_type="text_only"): prompts = { - "text_only": "What is the capital of China?", + "text_only": "What is the capital of China? Answer in 20 words.", "mix": "What is recited in the audio? What is in this image? What is in this video?", "text_video": "What is in this video? ", "text_image": "What is in this image? ", diff --git a/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml new file mode 100644 index 0000000000..3ebf914e88 --- /dev/null +++ b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml @@ -0,0 +1,31 @@ +stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 16384 + max_num_batched_tokens: 16384 + max_num_seqs: 1 + gpu_memory_utilization: 0.9 + skip_mm_profiling: true + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py index b5f9bac991..94544cbbec 100644 --- a/tests/engine/test_async_omni_engine_abort.py +++ b/tests/engine/test_async_omni_engine_abort.py @@ -15,8 +15,8 @@ SEED = 42 -stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml") -model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" +stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml") +model = "Qwen/Qwen2.5-Omni-7B" async def generate( From b1ece678f14f5bb256009518b649cbf8a2697055 Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Fri, 27 Feb 2026 18:32:53 +0800 Subject: [PATCH 2/6] [Refactor] Update Benchmark & Engine Test configuration - Consolidated the Benchmark & Engine Test steps in both test-merge.yml and test-ready.yml. - Changed the agent queue to "gpu_4_queue" and updated the Docker plugin configuration for better resource management. - Removed the deprecated stage configuration file for Qwen3 Omni Thinker. Signed-off-by: yenuo26 <410167048@qq.com> --- .buildkite/test-merge.yml | 52 ++++++-------- .buildkite/test-ready.yml | 70 +++++++++---------- .../stage_configs/qwen3_omni_thinker_ci.yaml | 31 -------- 3 files changed, 53 insertions(+), 100 deletions(-) delete mode 100644 tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 3609d266a2..2e6f883080 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -155,41 +155,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Benchmark & Engine Test with H100" - timeout_in_minutes: 15 + - label: "Benchmark & Engine Test" depends_on: upload-merge-pipeline commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/benchmarks/test_serve_cli.py - - pytest -s -v tests/engine/test_async_omni_engine_abort.py + - | + timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=$? + exit $((EXIT1 | EXIT2)) + ' agents: - queue: "mithril-h100-pool" + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" - label: "Omni Model Test" timeout_in_minutes: 15 diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 3f46336d53..b64a78fcae 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -36,8 +36,12 @@ steps: commands: - | timeout 20m bash -c ' - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py && + set +e + pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py + EXIT1=$? pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py + EXIT2=$? + exit $((EXIT1 | EXIT2)) ' agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU @@ -117,43 +121,33 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - # - label: "Benchmark & Engine Test with H100" - # depends_on: upload-ready-pipeline - # commands: - # - | - # timeout 15m bash -c ' - # export VLLM_WORKER_MULTIPROC_METHOD=spawn - # pytest -s -v tests/benchmarks/test_serve_cli.py - # pytest -s -v tests/engine/test_async_omni_engine_abort.py - # ' - # agents: - # queue: "mithril-h100-pool" - # plugins: - # - kubernetes: - # podSpec: - # containers: - # - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - # resources: - # limits: - # nvidia.com/gpu: 2 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # - name: hf-cache - # mountPath: /root/.cache/huggingface - # env: - # - name: HF_HOME - # value: /root/.cache/huggingface - # nodeSelector: - # node.kubernetes.io/instance-type: gpu-h100-sxm - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - name: hf-cache - # hostPath: - # path: /mnt/hf-cache - # type: DirectoryOrCreate + + - label: "Benchmark & Engine Test" + depends_on: upload-ready-pipeline + commands: + - | + timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=$? + exit $((EXIT1 | EXIT2)) + ' + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Omni Model Test" depends_on: upload-ready-pipeline diff --git a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml deleted file mode 100644 index a6b4404d70..0000000000 --- a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# The following config has been verified on 2x H100-80G GPUs. -stage_args: - - stage_id: 0 - runtime: - devices: "0,1" - max_batch_size: 5 - engine_args: - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - hf_config_name: thinker_config - tensor_parallel_size: 2 - load_format: dummy - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 100 - seed: 42 - detokenize: True - repetition_penalty: 1.05 From 357ea20e99c755c09e0e656762597c3b58c43d13 Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Fri, 27 Feb 2026 20:56:42 +0800 Subject: [PATCH 3/6] Fix interpolation escaping errors Signed-off-by: yenuo26 <410167048@qq.com> --- .buildkite/test-merge.yml | 6 +++--- .buildkite/test-nightly.yml | 6 +++--- .buildkite/test-ready.yml | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 2e6f883080..5479f8ac1e 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -163,10 +163,10 @@ steps: export VLLM_WORKER_MULTIPROC_METHOD=spawn set +e pytest -s -v tests/benchmarks/test_serve_cli.py - EXIT1=$? + EXIT1=$$? pytest -s -v tests/engine/test_async_omni_engine_abort.py - EXIT2=$? - exit $((EXIT1 | EXIT2)) + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) ' agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 9b33d62d01..7b535033e0 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -8,10 +8,10 @@ steps: - | set +e pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" - EXIT1=$? + EXIT1=$$? pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" - EXIT2=$? - exit $((EXIT1 | EXIT2)) + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) agents: queue: "mithril-h100-pool" plugins: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index ae36d72311..6c1b458b89 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -38,10 +38,10 @@ steps: timeout 20m bash -c ' set +e pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py - EXIT1=$? + EXIT1=$$? pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py - EXIT2=$? - exit $((EXIT1 | EXIT2)) + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) ' agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU @@ -130,10 +130,10 @@ steps: export VLLM_WORKER_MULTIPROC_METHOD=spawn set +e pytest -s -v tests/benchmarks/test_serve_cli.py - EXIT1=$? + EXIT1=$$? pytest -s -v tests/engine/test_async_omni_engine_abort.py - EXIT2=$? - exit $((EXIT1 | EXIT2)) + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) ' agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU From 17b0639040fb593f9a130982ee0522a3f0e4ba40 Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Fri, 27 Feb 2026 21:36:02 +0800 Subject: [PATCH 4/6] debug for test fail Signed-off-by: yenuo26 <410167048@qq.com> --- .buildkite/test-amd.yaml | 3 +- tests/benchmarks/test_serve_cli.py | 93 +++++++++++--------- tests/engine/test_async_omni_engine_abort.py | 2 +- 3 files changed, 53 insertions(+), 45 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c24f5abc79..f6354758b4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -89,6 +89,7 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py + - pytest -s -v tests/engine/test_async_omni_engine_abort.py - label: "Omni Model Test Qwen3-Omni" timeout_in_minutes: 15 @@ -102,7 +103,7 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py - - pytest -s -v tests/engine/test_async_omni_engine_abort.py + - label: "Diffusion Image Edit Test" timeout_in_minutes: 15 diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 77c7b49338..5c0575477c 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -1,9 +1,9 @@ -import subprocess +# import subprocess from pathlib import Path import pytest -from tests.conftest import OmniServer +# from tests.conftest import OmniServer from tests.utils import hardware_test models = ["Qwen/Qwen2.5-Omni-7B"] @@ -13,52 +13,59 @@ test_params = [(model, stage_config) for model in models for stage_config in stage_configs] -@pytest.fixture(scope="module") -def omni_server(request): - """Start vLLM-Omni server as a subprocess with actual model weights. - Uses session scope so the server starts only once for the entire test session. - Multi-stage initialization can take 10-20+ minutes. - """ - model, stage_config_path = request.param +# @pytest.fixture(scope="module") +# def omni_server(request): +# """Start vLLM-Omni server as a subprocess with actual model weights. +# Uses session scope so the server starts only once for the entire test session. +# Multi-stage initialization can take 10-20+ minutes. +# """ +# model, stage_config_path = request.param - print(f"Starting OmniServer with model: {model}") - print("This may take 10-20+ minutes for initialization...") +# print(f"Starting OmniServer with model: {model}") +# print("This may take 10-20+ minutes for initialization...") - with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopped") +# with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: +# print("OmniServer started successfully") +# yield server +# print("OmniServer stopped") @pytest.mark.core_model @pytest.mark.benchmark @hardware_test(res={"cuda": "H100"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_bench_serve_chat(omni_server): - command = [ - "vllm", - "bench", - "serve", - "--omni", - "--model", - omni_server.model, - "--port", - str(omni_server.port), - "--dataset-name", - "random", - "--random-input-len", - "32", - "--random-output-len", - "4", - "--num-prompts", - "5", - "--endpoint", - "/v1/chat/completions", - "--backend", - "openai-chat-omni", - ] - result = subprocess.run(command, capture_output=True, text=True) - print(result.stdout) - print(result.stderr) +def test_bench_serve_chat(): + assert False, "for debug" - assert result.returncode == 0, f"Benchmark failed: {result.stderr}" + +# @pytest.mark.core_model +# @pytest.mark.benchmark +# @hardware_test(res={"cuda": "L4"}, num_cards=3) +# @pytest.mark.parametrize("omni_server", test_params, indirect=True) +# def test_bench_serve_chat(omni_server): +# command = [ +# "vllm", +# "bench", +# "serve", +# "--omni", +# "--model", +# omni_server.model, +# "--port", +# str(omni_server.port), +# "--dataset-name", +# "random", +# "--random-input-len", +# "32", +# "--random-output-len", +# "4", +# "--num-prompts", +# "5", +# "--endpoint", +# "/v1/chat/completions", +# "--backend", +# "openai-chat-omni", +# ] +# result = subprocess.run(command, capture_output=True, text=True) +# print(result.stdout) +# print(result.stderr) + +# assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py index 94544cbbec..a99c522c30 100644 --- a/tests/engine/test_async_omni_engine_abort.py +++ b/tests/engine/test_async_omni_engine_abort.py @@ -60,7 +60,7 @@ async def generate( @pytest.mark.core_model @pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.asyncio async def test_abort(): with ExitStack() as after: From 97f645344cf2baad7d35752fbbe8abd400d2b685 Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Fri, 27 Feb 2026 22:05:59 +0800 Subject: [PATCH 5/6] recover debug step Signed-off-by: yenuo26 <410167048@qq.com> --- tests/benchmarks/test_serve_cli.py | 87 +++++++++++------------------- 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 5c0575477c..a48731934b 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -1,9 +1,8 @@ -# import subprocess +import subprocess from pathlib import Path import pytest -# from tests.conftest import OmniServer from tests.utils import hardware_test models = ["Qwen/Qwen2.5-Omni-7B"] @@ -13,59 +12,35 @@ test_params = [(model, stage_config) for model in models for stage_config in stage_configs] -# @pytest.fixture(scope="module") -# def omni_server(request): -# """Start vLLM-Omni server as a subprocess with actual model weights. -# Uses session scope so the server starts only once for the entire test session. -# Multi-stage initialization can take 10-20+ minutes. -# """ -# model, stage_config_path = request.param - -# print(f"Starting OmniServer with model: {model}") -# print("This may take 10-20+ minutes for initialization...") - -# with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: -# print("OmniServer started successfully") -# yield server -# print("OmniServer stopped") - - @pytest.mark.core_model @pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=2) -def test_bench_serve_chat(): - assert False, "for debug" - - -# @pytest.mark.core_model -# @pytest.mark.benchmark -# @hardware_test(res={"cuda": "L4"}, num_cards=3) -# @pytest.mark.parametrize("omni_server", test_params, indirect=True) -# def test_bench_serve_chat(omni_server): -# command = [ -# "vllm", -# "bench", -# "serve", -# "--omni", -# "--model", -# omni_server.model, -# "--port", -# str(omni_server.port), -# "--dataset-name", -# "random", -# "--random-input-len", -# "32", -# "--random-output-len", -# "4", -# "--num-prompts", -# "5", -# "--endpoint", -# "/v1/chat/completions", -# "--backend", -# "openai-chat-omni", -# ] -# result = subprocess.run(command, capture_output=True, text=True) -# print(result.stdout) -# print(result.stderr) - -# assert result.returncode == 0, f"Benchmark failed: {result.stderr}" +@hardware_test(res={"cuda": "L4"}, num_cards=3) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_bench_serve_chat(omni_server): + command = [ + "vllm", + "bench", + "serve", + "--omni", + "--model", + omni_server.model, + "--port", + str(omni_server.port), + "--dataset-name", + "random", + "--random-input-len", + "32", + "--random-output-len", + "4", + "--num-prompts", + "5", + "--endpoint", + "/v1/chat/completions", + "--backend", + "openai-chat-omni", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" From 9f90f9711e823a0b919da29f2d7afe8c452c9ad1 Mon Sep 17 00:00:00 2001 From: yenuo26 <410167048@qq.com> Date: Sat, 28 Feb 2026 11:38:03 +0800 Subject: [PATCH 6/6] Add mm_processor_cache_gb configuration to stage YAML files - Set mm_processor_cache_gb to 0 in qwen2_5_omni_ci.yaml, qwen2_5_omni_multiconnector.yaml, and qwen2_5_omni.yaml. - Removed skip marker from test_qwen2_5_omni.py to enable the test. Signed-off-by: yenuo26 <410167048@qq.com> --- tests/e2e/stage_configs/qwen2_5_omni_ci.yaml | 1 + tests/examples/online_serving/test_qwen2_5_omni.py | 1 - vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml | 1 + .../stage_configs/qwen2_5_omni_multiconnector.yaml | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml index 32b1ba15bd..59bdc9df97 100644 --- a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml +++ b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml @@ -22,6 +22,7 @@ stage_args: trust_remote_code: true engine_output_type: latent enable_prefix_caching: false + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py index a4199f18e9..8e08d5bc50 100644 --- a/tests/examples/online_serving/test_qwen2_5_omni.py +++ b/tests/examples/online_serving/test_qwen2_5_omni.py @@ -236,7 +236,6 @@ def test_modality_control_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. -@pytest.mark.skip(reason="There is a known issue with stream error.") @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml index e6ed976607..3c05cffb72 100644 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml @@ -19,6 +19,7 @@ stage_args: engine_output_type: latent enable_prefix_caching: false max_num_batched_tokens: 32768 + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml index f5d87aece4..5e379aa6b7 100644 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml @@ -18,6 +18,7 @@ stage_args: trust_remote_code: true engine_output_type: latent enable_prefix_caching: false + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text