From e031a23cb156a649da81ca9c6cb41d0ede70898a Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 27 Feb 2026 16:26:35 +0800
Subject: [PATCH 1/6] [Update] Modify nightly test commands and update model
 configurations

- Updated the nightly test script to handle multiple pytest commands and capture exit statuses.
- Changed model from "Qwen/Qwen3-Omni-30B-A3B-Instruct" to "Qwen/Qwen2.5-Omni-7B" in the benchmark and engine abort tests.
- Added tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml and pointed the benchmark and engine abort tests at the Qwen2.5-Omni stage configurations.
- Adjusted prompt in the online serving test to specify a word limit for the answer.

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-nightly.yml                   |  9 ++++--
 tests/benchmarks/test_serve_cli.py            |  4 +--
 .../test_qwen3_omni_expansion.py              |  2 +-
 .../qwen2_5_omni_thinker_ci.yaml              | 31 +++++++++++++++++++
 tests/engine/test_async_omni_engine_abort.py  |  4 +--
 5 files changed, 43 insertions(+), 7 deletions(-)
 create mode 100644 tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index b70e73c489..30455468c2 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -5,8 +5,13 @@ steps:
     if: build.env("NIGHTLY") == "1"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
-      - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
+      - |
+        set +e
+        pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
+        EXIT1=$?
+        pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
+        EXIT2=$?
+        exit $((EXIT1 | EXIT2))
     agents:
       queue: "mithril-h100-pool"
     plugins:
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 40244eb572..77c7b49338 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -6,8 +6,8 @@
 from tests.conftest import OmniServer
 from tests.utils import hardware_test
 
-models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
-stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
+models = ["Qwen/Qwen2.5-Omni-7B"]
+stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")]
 
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index c4731ffc7d..e2f77af736 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -71,7 +71,7 @@ def get_system_prompt():
 
 def get_prompt(prompt_type="text_only"):
     prompts = {
-        "text_only": "What is the capital of China?",
+        "text_only": "What is the capital of China? Answer in 20 words.",
         "mix": "What is recited in the audio? What is in this image? What is in this video?",
         "text_video": "What is in this video? ",
         "text_image": "What is in this image? ",
diff --git a/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml
new file mode 100644
index 0000000000..3ebf914e88
--- /dev/null
+++ b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml
@@ -0,0 +1,31 @@
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true            # Run this stage in a separate process
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 16384
+      max_num_batched_tokens: 16384
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.9
+      skip_mm_profiling: true
+      enforce_eager: true  # Only eager mode is supported for now
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py
index b5f9bac991..94544cbbec 100644
--- a/tests/engine/test_async_omni_engine_abort.py
+++ b/tests/engine/test_async_omni_engine_abort.py
@@ -15,8 +15,8 @@
 
 SEED = 42
 
-stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml")
-model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml")
+model = "Qwen/Qwen2.5-Omni-7B"
 
 
 async def generate(

From b1ece678f14f5bb256009518b649cbf8a2697055 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 27 Feb 2026 18:32:53 +0800
Subject: [PATCH 2/6] [Refactor] Update Benchmark & Engine Test configuration

- Consolidated the Benchmark & Engine Test steps in both test-merge.yml and test-ready.yml.
- Moved the step from the "mithril-h100-pool" queue (Kubernetes plugin) to "gpu_4_queue" using the Docker plugin.
- Removed the deprecated stage configuration file for Qwen3 Omni Thinker.

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-merge.yml                     | 52 ++++++--------
 .buildkite/test-ready.yml                     | 70 +++++++++----------
 .../stage_configs/qwen3_omni_thinker_ci.yaml  | 31 --------
 3 files changed, 53 insertions(+), 100 deletions(-)
 delete mode 100644 tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 3609d266a2..2e6f883080 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -155,41 +155,31 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Benchmark & Engine Test with H100"
-    timeout_in_minutes: 15
+  - label: "Benchmark & Engine Test"
     depends_on: upload-merge-pipeline
     commands:
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/benchmarks/test_serve_cli.py
-      - pytest -s -v tests/engine/test_async_omni_engine_abort.py
+      - |
+        timeout 15m bash -c '
+                export VLLM_WORKER_MULTIPROC_METHOD=spawn
+                set +e
+                pytest -s -v tests/benchmarks/test_serve_cli.py
+                EXIT1=$?
+                pytest -s -v tests/engine/test_async_omni_engine_abort.py
+                EXIT2=$?
+                exit $((EXIT1 | EXIT2))
+        '
     agents:
-      queue: "mithril-h100-pool"
+      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
-      - kubernetes:
-          podSpec:
-            containers:
-              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                resources:
-                  limits:
-                    nvidia.com/gpu: 2
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                  - name: hf-cache
-                    mountPath: /root/.cache/huggingface
-                env:
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-            nodeSelector:
-              node.kubernetes.io/instance-type: gpu-h100-sxm
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-              - name: hf-cache
-                hostPath:
-                  path: /mnt/hf-cache
-                  type: DirectoryOrCreate
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          shm-size: "8gb"
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Omni Model Test"
     timeout_in_minutes: 15
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 3f46336d53..b64a78fcae 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -36,8 +36,12 @@ steps:
     commands:
       - |
         timeout 20m bash -c '
-          pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py &&
+          set +e
+          pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+          EXIT1=$?
           pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+          EXIT2=$?
+          exit $((EXIT1 | EXIT2))
         '
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
@@ -117,43 +121,33 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  # - label: "Benchmark & Engine Test with H100"
-  #   depends_on: upload-ready-pipeline
-  #   commands:
-  #     - |
-  #       timeout 15m bash -c '
-  #         export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  #         pytest -s -v tests/benchmarks/test_serve_cli.py
-  #         pytest -s -v tests/engine/test_async_omni_engine_abort.py
-  #       '
-  #   agents:
-  #     queue: "mithril-h100-pool"
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           containers:
-  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #               resources:
-  #                 limits:
-  #                   nvidia.com/gpu: 2
-  #               volumeMounts:
-  #                 - name: devshm
-  #                   mountPath: /dev/shm
-  #                 - name: hf-cache
-  #                   mountPath: /root/.cache/huggingface
-  #               env:
-  #                 - name: HF_HOME
-  #                   value: /root/.cache/huggingface
-  #           nodeSelector:
-  #             node.kubernetes.io/instance-type: gpu-h100-sxm
-  #           volumes:
-  #             - name: devshm
-  #               emptyDir:
-  #                 medium: Memory
-  #             - name: hf-cache
-  #               hostPath:
-  #                 path: /mnt/hf-cache
-  #                 type: DirectoryOrCreate
+
+  - label: "Benchmark & Engine Test"
+    depends_on: upload-ready-pipeline
+    commands:
+      - |
+        timeout 15m bash -c '
+                export VLLM_WORKER_MULTIPROC_METHOD=spawn
+                set +e
+                pytest -s -v tests/benchmarks/test_serve_cli.py
+                EXIT1=$?
+                pytest -s -v tests/engine/test_async_omni_engine_abort.py
+                EXIT2=$?
+                exit $((EXIT1 | EXIT2))
+        '
+    agents:
+      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          shm-size: "8gb"
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
 
   - label: "Omni Model Test"
     depends_on: upload-ready-pipeline
diff --git a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml
deleted file mode 100644
index a6b4404d70..0000000000
--- a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-# The following config has been verified on 2x H100-80G GPUs.
-stage_args:
-  - stage_id: 0
-    runtime:
-      devices: "0,1"
-      max_batch_size: 5
-    engine_args:
-      model_stage: thinker
-      model_arch: Qwen3OmniMoeForConditionalGeneration
-      worker_type: ar
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
-      enforce_eager: false
-      trust_remote_code: true
-      engine_output_type: latent  # Output hidden states for talker
-      distributed_executor_backend: "mp"
-      enable_prefix_caching: false
-      hf_config_name: thinker_config
-      tensor_parallel_size: 2
-      load_format: dummy
-    final_output: true
-    final_output_type: text
-    is_comprehension: true
-    default_sampling_params:
-      temperature: 0.4
-      top_p: 0.9
-      top_k: 1
-      max_tokens: 100
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.05

From 357ea20e99c755c09e0e656762597c3b58c43d13 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 27 Feb 2026 20:56:42 +0800
Subject: [PATCH 3/6] Fix interpolation escaping errors

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-merge.yml   |  6 +++---
 .buildkite/test-nightly.yml |  6 +++---
 .buildkite/test-ready.yml   | 12 ++++++------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 2e6f883080..5479f8ac1e 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -163,10 +163,10 @@ steps:
                 export VLLM_WORKER_MULTIPROC_METHOD=spawn
                 set +e
                 pytest -s -v tests/benchmarks/test_serve_cli.py
-                EXIT1=$?
+                EXIT1=$$?
                 pytest -s -v tests/engine/test_async_omni_engine_abort.py
-                EXIT2=$?
-                exit $((EXIT1 | EXIT2))
+                EXIT2=$$?
+                exit $$((EXIT1 | EXIT2))
         '
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 9b33d62d01..7b535033e0 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -8,10 +8,10 @@ steps:
       - |
         set +e
         pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
-        EXIT1=$?
+        EXIT1=$$?
         pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
-        EXIT2=$?
-        exit $((EXIT1 | EXIT2))
+        EXIT2=$$?
+        exit $$((EXIT1 | EXIT2))
     agents:
       queue: "mithril-h100-pool"
     plugins:
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index ae36d72311..6c1b458b89 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -38,10 +38,10 @@ steps:
         timeout 20m bash -c '
           set +e
           pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
-          EXIT1=$?
+          EXIT1=$$?
           pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
-          EXIT2=$?
-          exit $((EXIT1 | EXIT2))
+          EXIT2=$$?
+          exit $$((EXIT1 | EXIT2))
         '
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
@@ -130,10 +130,10 @@ steps:
                 export VLLM_WORKER_MULTIPROC_METHOD=spawn
                 set +e
                 pytest -s -v tests/benchmarks/test_serve_cli.py
-                EXIT1=$?
+                EXIT1=$$?
                 pytest -s -v tests/engine/test_async_omni_engine_abort.py
-                EXIT2=$?
-                exit $((EXIT1 | EXIT2))
+                EXIT2=$$?
+                exit $$((EXIT1 | EXIT2))
         '
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU

From 17b0639040fb593f9a130982ee0522a3f0e4ba40 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 27 Feb 2026 21:36:02 +0800
Subject: [PATCH 4/6] Debug test failure

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-amd.yaml                     |  3 +-
 tests/benchmarks/test_serve_cli.py           | 93 +++++++++++---------
 tests/engine/test_async_omni_engine_abort.py |  2 +-
 3 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index c24f5abc79..f6354758b4 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -89,6 +89,7 @@ steps:
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+    - pytest -s -v tests/engine/test_async_omni_engine_abort.py
 
 - label: "Omni Model Test Qwen3-Omni"
   timeout_in_minutes: 15
@@ -102,7 +103,7 @@ steps:
     - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
     - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
     - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
-    - pytest -s -v tests/engine/test_async_omni_engine_abort.py
+
 
 - label: "Diffusion Image Edit Test"
   timeout_in_minutes: 15
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 77c7b49338..5c0575477c 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -1,9 +1,9 @@
-import subprocess
+# import subprocess
 from pathlib import Path
 
 import pytest
 
-from tests.conftest import OmniServer
+# from tests.conftest import OmniServer
 from tests.utils import hardware_test
 
 models = ["Qwen/Qwen2.5-Omni-7B"]
@@ -13,52 +13,59 @@
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
 
-@pytest.fixture(scope="module")
-def omni_server(request):
-    """Start vLLM-Omni server as a subprocess with actual model weights.
-    Uses session scope so the server starts only once for the entire test session.
-    Multi-stage initialization can take 10-20+ minutes.
-    """
-    model, stage_config_path = request.param
+# @pytest.fixture(scope="module")
+# def omni_server(request):
+#     """Start vLLM-Omni server as a subprocess with actual model weights.
+#     Uses session scope so the server starts only once for the entire test session.
+#     Multi-stage initialization can take 10-20+ minutes.
+#     """
+#     model, stage_config_path = request.param
 
-    print(f"Starting OmniServer with model: {model}")
-    print("This may take 10-20+ minutes for initialization...")
+#     print(f"Starting OmniServer with model: {model}")
+#     print("This may take 10-20+ minutes for initialization...")
 
-    with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
-        print("OmniServer started successfully")
-        yield server
-        print("OmniServer stopped")
+#     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
+#         print("OmniServer started successfully")
+#         yield server
+#         print("OmniServer stopped")
 
 
 @pytest.mark.core_model
 @pytest.mark.benchmark
 @hardware_test(res={"cuda": "H100"}, num_cards=2)
-@pytest.mark.parametrize("omni_server", test_params, indirect=True)
-def test_bench_serve_chat(omni_server):
-    command = [
-        "vllm",
-        "bench",
-        "serve",
-        "--omni",
-        "--model",
-        omni_server.model,
-        "--port",
-        str(omni_server.port),
-        "--dataset-name",
-        "random",
-        "--random-input-len",
-        "32",
-        "--random-output-len",
-        "4",
-        "--num-prompts",
-        "5",
-        "--endpoint",
-        "/v1/chat/completions",
-        "--backend",
-        "openai-chat-omni",
-    ]
-    result = subprocess.run(command, capture_output=True, text=True)
-    print(result.stdout)
-    print(result.stderr)
+def test_bench_serve_chat():
+    assert False, "for debug"
 
-    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+
+# @pytest.mark.core_model
+# @pytest.mark.benchmark
+# @hardware_test(res={"cuda": "L4"}, num_cards=3)
+# @pytest.mark.parametrize("omni_server", test_params, indirect=True)
+# def test_bench_serve_chat(omni_server):
+# command = [
+#     "vllm",
+#     "bench",
+#     "serve",
+#     "--omni",
+#     "--model",
+#     omni_server.model,
+#     "--port",
+#     str(omni_server.port),
+#     "--dataset-name",
+#     "random",
+#     "--random-input-len",
+#     "32",
+#     "--random-output-len",
+#     "4",
+#     "--num-prompts",
+#     "5",
+#     "--endpoint",
+#     "/v1/chat/completions",
+#     "--backend",
+#     "openai-chat-omni",
+# ]
+# result = subprocess.run(command, capture_output=True, text=True)
+# print(result.stdout)
+# print(result.stderr)
+
+# assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py
index 94544cbbec..a99c522c30 100644
--- a/tests/engine/test_async_omni_engine_abort.py
+++ b/tests/engine/test_async_omni_engine_abort.py
@@ -60,7 +60,7 @@ async def generate(
 
 @pytest.mark.core_model
 @pytest.mark.omni
-@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
 @pytest.mark.asyncio
 async def test_abort():
     with ExitStack() as after:

From 97f645344cf2baad7d35752fbbe8abd400d2b685 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 27 Feb 2026 22:05:59 +0800
Subject: [PATCH 5/6] Restore benchmark test after debugging

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/benchmarks/test_serve_cli.py | 87 +++++++++++-------------------
 1 file changed, 31 insertions(+), 56 deletions(-)

diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 5c0575477c..a48731934b 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -1,9 +1,8 @@
-# import subprocess
+import subprocess
 from pathlib import Path
 
 import pytest
 
-# from tests.conftest import OmniServer
 from tests.utils import hardware_test
 
 models = ["Qwen/Qwen2.5-Omni-7B"]
@@ -13,59 +12,35 @@
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
 
-# @pytest.fixture(scope="module")
-# def omni_server(request):
-#     """Start vLLM-Omni server as a subprocess with actual model weights.
-#     Uses session scope so the server starts only once for the entire test session.
-#     Multi-stage initialization can take 10-20+ minutes.
-#     """
-#     model, stage_config_path = request.param
-
-#     print(f"Starting OmniServer with model: {model}")
-#     print("This may take 10-20+ minutes for initialization...")
-
-#     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
-#         print("OmniServer started successfully")
-#         yield server
-#         print("OmniServer stopped")
-
-
 @pytest.mark.core_model
 @pytest.mark.benchmark
-@hardware_test(res={"cuda": "H100"}, num_cards=2)
-def test_bench_serve_chat():
-    assert False, "for debug"
-
-
-# @pytest.mark.core_model
-# @pytest.mark.benchmark
-# @hardware_test(res={"cuda": "L4"}, num_cards=3)
-# @pytest.mark.parametrize("omni_server", test_params, indirect=True)
-# def test_bench_serve_chat(omni_server):
-# command = [
-#     "vllm",
-#     "bench",
-#     "serve",
-#     "--omni",
-#     "--model",
-#     omni_server.model,
-#     "--port",
-#     str(omni_server.port),
-#     "--dataset-name",
-#     "random",
-#     "--random-input-len",
-#     "32",
-#     "--random-output-len",
-#     "4",
-#     "--num-prompts",
-#     "5",
-#     "--endpoint",
-#     "/v1/chat/completions",
-#     "--backend",
-#     "openai-chat-omni",
-# ]
-# result = subprocess.run(command, capture_output=True, text=True)
-# print(result.stdout)
-# print(result.stderr)
-
-# assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+@hardware_test(res={"cuda": "L4"}, num_cards=3)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_bench_serve_chat(omni_server):
+    command = [
+        "vllm",
+        "bench",
+        "serve",
+        "--omni",
+        "--model",
+        omni_server.model,
+        "--port",
+        str(omni_server.port),
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "32",
+        "--random-output-len",
+        "4",
+        "--num-prompts",
+        "5",
+        "--endpoint",
+        "/v1/chat/completions",
+        "--backend",
+        "openai-chat-omni",
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+
+    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"

From 9f90f9711e823a0b919da29f2d7afe8c452c9ad1 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Sat, 28 Feb 2026 11:38:03 +0800
Subject: [PATCH 6/6] Add mm_processor_cache_gb configuration to stage YAML
 files

- Set mm_processor_cache_gb to 0 in qwen2_5_omni_ci.yaml, qwen2_5_omni_multiconnector.yaml, and qwen2_5_omni.yaml.
- Removed skip marker from test_qwen2_5_omni.py to enable the test.

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/e2e/stage_configs/qwen2_5_omni_ci.yaml                     | 1 +
 tests/examples/online_serving/test_qwen2_5_omni.py               | 1 -
 vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml         | 1 +
 .../stage_configs/qwen2_5_omni_multiconnector.yaml               | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml
index 32b1ba15bd..59bdc9df97 100644
--- a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml
+++ b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml
@@ -22,6 +22,7 @@ stage_args:
       trust_remote_code: true
       engine_output_type: latent
       enable_prefix_caching: false
+      mm_processor_cache_gb: 0
     is_comprehension: true
     final_output: true
     final_output_type: text
diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py
index a4199f18e9..8e08d5bc50 100644
--- a/tests/examples/online_serving/test_qwen2_5_omni.py
+++ b/tests/examples/online_serving/test_qwen2_5_omni.py
@@ -236,7 +236,6 @@ def test_modality_control_003(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.skip(reason="There is a known issue with stream error.")
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml
index e6ed976607..3c05cffb72 100644
--- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml
@@ -19,6 +19,7 @@ stage_args:
       engine_output_type: latent
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
+      mm_processor_cache_gb: 0
     is_comprehension: true
     final_output: true
     final_output_type: text
diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml
index f5d87aece4..5e379aa6b7 100644
--- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml
@@ -18,6 +18,7 @@ stage_args:
       trust_remote_code: true
       engine_output_type: latent
       enable_prefix_caching: false
+      mm_processor_cache_gb: 0
     is_comprehension: true
     final_output: true
     final_output_type: text