Skip to content

Commit cd2234a

Browse files
authored
[CI] Modify some CI test cases to run on L4 environment to reduce H100 resource usage. (#1543)
Signed-off-by: yenuo26 <410167048@qq.com>
Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com>
1 parent b0156d8 commit cd2234a

File tree

13 files changed

+103
-129
lines changed

13 files changed

+103
-129
lines changed

.buildkite/test-amd.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ steps:
8989
- export VLLM_LOGGING_LEVEL=DEBUG
9090
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
9191
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
92+
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
9293

9394
- label: "Omni Model Test Qwen3-Omni"
9495
timeout_in_minutes: 15
@@ -102,7 +103,7 @@ steps:
102103
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
103104
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
104105
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
105-
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
106+
106107

107108
- label: "Diffusion Image Edit Test"
108109
timeout_in_minutes: 15

.buildkite/test-merge.yml

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -155,41 +155,31 @@ steps:
155155
volumes:
156156
- "/fsx/hf_cache:/fsx/hf_cache"
157157

158-
- label: "Benchmark & Engine Test with H100"
159-
timeout_in_minutes: 15
158+
- label: "Benchmark & Engine Test"
160159
depends_on: upload-merge-pipeline
161160
commands:
162-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
163-
- pytest -s -v tests/benchmarks/test_serve_cli.py
164-
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
161+
- |
162+
timeout 15m bash -c '
163+
export VLLM_WORKER_MULTIPROC_METHOD=spawn
164+
set +e
165+
pytest -s -v tests/benchmarks/test_serve_cli.py
166+
EXIT1=$$?
167+
pytest -s -v tests/engine/test_async_omni_engine_abort.py
168+
EXIT2=$$?
169+
exit $$((EXIT1 | EXIT2))
170+
'
165171
agents:
166-
queue: "mithril-h100-pool"
172+
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
167173
plugins:
168-
- kubernetes:
169-
podSpec:
170-
containers:
171-
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
172-
resources:
173-
limits:
174-
nvidia.com/gpu: 2
175-
volumeMounts:
176-
- name: devshm
177-
mountPath: /dev/shm
178-
- name: hf-cache
179-
mountPath: /root/.cache/huggingface
180-
env:
181-
- name: HF_HOME
182-
value: /root/.cache/huggingface
183-
nodeSelector:
184-
node.kubernetes.io/instance-type: gpu-h100-sxm
185-
volumes:
186-
- name: devshm
187-
emptyDir:
188-
medium: Memory
189-
- name: hf-cache
190-
hostPath:
191-
path: /mnt/hf-cache
192-
type: DirectoryOrCreate
174+
- docker#v5.2.0:
175+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
176+
always-pull: true
177+
propagate-environment: true
178+
shm-size: "8gb"
179+
environment:
180+
- "HF_HOME=/fsx/hf_cache"
181+
volumes:
182+
- "/fsx/hf_cache:/fsx/hf_cache"
193183

194184
- label: "Omni Model Test"
195185
timeout_in_minutes: 15

.buildkite/test-nightly.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@ steps:
55
if: build.env("NIGHTLY") == "1"
66
commands:
77
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
8-
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
9-
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
8+
- |
9+
set +e
10+
pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
11+
EXIT1=$$?
12+
pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
13+
EXIT2=$$?
14+
exit $$((EXIT1 | EXIT2))
1015
agents:
1116
queue: "mithril-h100-pool"
1217
plugins:

.buildkite/test-ready.yml

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ steps:
3636
commands:
3737
- |
3838
timeout 20m bash -c '
39-
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py &&
39+
set +e
40+
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
41+
EXIT1=$$?
4042
pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
43+
EXIT2=$$?
44+
exit $$((EXIT1 | EXIT2))
4145
'
4246
agents:
4347
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
@@ -117,43 +121,33 @@ steps:
117121
volumes:
118122
- "/fsx/hf_cache:/fsx/hf_cache"
119123

120-
# - label: "Benchmark & Engine Test with H100"
121-
# depends_on: upload-ready-pipeline
122-
# commands:
123-
# - |
124-
# timeout 15m bash -c '
125-
# export VLLM_WORKER_MULTIPROC_METHOD=spawn
126-
# pytest -s -v tests/benchmarks/test_serve_cli.py
127-
# pytest -s -v tests/engine/test_async_omni_engine_abort.py
128-
# '
129-
# agents:
130-
# queue: "mithril-h100-pool"
131-
# plugins:
132-
# - kubernetes:
133-
# podSpec:
134-
# containers:
135-
# - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
136-
# resources:
137-
# limits:
138-
# nvidia.com/gpu: 2
139-
# volumeMounts:
140-
# - name: devshm
141-
# mountPath: /dev/shm
142-
# - name: hf-cache
143-
# mountPath: /root/.cache/huggingface
144-
# env:
145-
# - name: HF_HOME
146-
# value: /root/.cache/huggingface
147-
# nodeSelector:
148-
# node.kubernetes.io/instance-type: gpu-h100-sxm
149-
# volumes:
150-
# - name: devshm
151-
# emptyDir:
152-
# medium: Memory
153-
# - name: hf-cache
154-
# hostPath:
155-
# path: /mnt/hf-cache
156-
# type: DirectoryOrCreate
124+
125+
- label: "Benchmark & Engine Test"
126+
depends_on: upload-ready-pipeline
127+
commands:
128+
- |
129+
timeout 15m bash -c '
130+
export VLLM_WORKER_MULTIPROC_METHOD=spawn
131+
set +e
132+
pytest -s -v tests/benchmarks/test_serve_cli.py
133+
EXIT1=$$?
134+
pytest -s -v tests/engine/test_async_omni_engine_abort.py
135+
EXIT2=$$?
136+
exit $$((EXIT1 | EXIT2))
137+
'
138+
agents:
139+
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
140+
plugins:
141+
- docker#v5.2.0:
142+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
143+
always-pull: true
144+
propagate-environment: true
145+
shm-size: "8gb"
146+
environment:
147+
- "HF_HOME=/fsx/hf_cache"
148+
volumes:
149+
- "/fsx/hf_cache:/fsx/hf_cache"
150+
157151

158152
- label: "Omni Model Test"
159153
depends_on: upload-ready-pipeline

tests/benchmarks/test_serve_cli.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,36 +3,18 @@
33

44
import pytest
55

6-
from tests.conftest import OmniServer
76
from tests.utils import hardware_test
87

9-
models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
10-
stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
8+
models = ["Qwen/Qwen2.5-Omni-7B"]
9+
stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")]
1110

1211
# Create parameter combinations for model and stage config
1312
test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
1413

1514

16-
@pytest.fixture(scope="module")
17-
def omni_server(request):
18-
"""Start vLLM-Omni server as a subprocess with actual model weights.
19-
Uses session scope so the server starts only once for the entire test session.
20-
Multi-stage initialization can take 10-20+ minutes.
21-
"""
22-
model, stage_config_path = request.param
23-
24-
print(f"Starting OmniServer with model: {model}")
25-
print("This may take 10-20+ minutes for initialization...")
26-
27-
with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server:
28-
print("OmniServer started successfully")
29-
yield server
30-
print("OmniServer stopped")
31-
32-
3315
@pytest.mark.core_model
3416
@pytest.mark.benchmark
35-
@hardware_test(res={"cuda": "H100"}, num_cards=2)
17+
@hardware_test(res={"cuda": "L4"}, num_cards=3)
3618
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
3719
def test_bench_serve_chat(omni_server):
3820
command = [

tests/e2e/online_serving/test_qwen3_omni_expansion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def get_system_prompt():
7171

7272
def get_prompt(prompt_type="text_only"):
7373
prompts = {
74-
"text_only": "What is the capital of China?",
74+
"text_only": "What is the capital of China? Answer in 20 words.",
7575
"mix": "What is recited in the audio? What is in this image? What is in this video?",
7676
"text_video": "What is in this video? ",
7777
"text_image": "What is in this image? ",

tests/e2e/stage_configs/qwen2_5_omni_ci.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ stage_args:
2222
trust_remote_code: true
2323
engine_output_type: latent
2424
enable_prefix_caching: false
25+
mm_processor_cache_gb: 0
2526
is_comprehension: true
2627
final_output: true
2728
final_output_type: text
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
stage_args:
2+
- stage_id: 0
3+
runtime:
4+
process: true # Run this stage in a separate process
5+
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
6+
max_batch_size: 1
7+
engine_args:
8+
model_stage: thinker
9+
model_arch: Qwen2_5OmniForConditionalGeneration
10+
worker_type: ar
11+
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
12+
max_model_len: 16384
13+
max_num_batched_tokens: 16384
14+
max_num_seqs: 1
15+
gpu_memory_utilization: 0.9
16+
skip_mm_profiling: true
17+
enforce_eager: true # Now we only support eager mode
18+
trust_remote_code: true
19+
engine_output_type: latent
20+
enable_prefix_caching: false
21+
is_comprehension: true
22+
final_output: true
23+
final_output_type: text
24+
default_sampling_params:
25+
temperature: 0.0
26+
top_p: 1.0
27+
top_k: -1
28+
max_tokens: 128
29+
seed: 42
30+
detokenize: True
31+
repetition_penalty: 1.1

tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml

Lines changed: 0 additions & 31 deletions
This file was deleted.

tests/engine/test_async_omni_engine_abort.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515

1616
SEED = 42
1717

18-
stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml")
19-
model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
18+
stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml")
19+
model = "Qwen/Qwen2.5-Omni-7B"
2020

2121

2222
async def generate(
@@ -60,7 +60,7 @@ async def generate(
6060

6161
@pytest.mark.core_model
6262
@pytest.mark.omni
63-
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
63+
@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
6464
@pytest.mark.asyncio
6565
async def test_abort():
6666
with ExitStack() as after:

0 commit comments

Comments (0)