diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c24f5abc79..f6354758b4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -89,6 +89,7 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py + - pytest -s -v tests/engine/test_async_omni_engine_abort.py - label: "Omni Model Test Qwen3-Omni" timeout_in_minutes: 15 @@ -102,7 +103,7 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py - - pytest -s -v tests/engine/test_async_omni_engine_abort.py + - label: "Diffusion Image Edit Test" timeout_in_minutes: 15 diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 3609d266a2..5479f8ac1e 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -155,41 +155,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Benchmark & Engine Test with H100" - timeout_in_minutes: 15 + - label: "Benchmark & Engine Test" depends_on: upload-merge-pipeline commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/benchmarks/test_serve_cli.py - - pytest -s -v tests/engine/test_async_omni_engine_abort.py + - | + timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=$$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) + ' agents: - queue: "mithril-h100-pool" + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" - label: "Omni Model Test" timeout_in_minutes: 15 diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 1c22d29605..7b535033e0 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -5,8 +5,13 @@ steps: if: build.env("NIGHTLY") == "1" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" - - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + - | + set +e + pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" + EXIT1=$$? + pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) agents: queue: "mithril-h100-pool" plugins: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index c36189c867..6c1b458b89 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -36,8 +36,12 @@ steps: commands: - | timeout 20m bash -c ' - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py && + set +e + pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py + EXIT1=$$? pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) ' agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU @@ -117,43 +121,33 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - # - label: "Benchmark & Engine Test with H100" - # depends_on: upload-ready-pipeline - # commands: - # - | - # timeout 15m bash -c ' - # export VLLM_WORKER_MULTIPROC_METHOD=spawn - # pytest -s -v tests/benchmarks/test_serve_cli.py - # pytest -s -v tests/engine/test_async_omni_engine_abort.py - # ' - # agents: - # queue: "mithril-h100-pool" - # plugins: - # - kubernetes: - # podSpec: - # containers: - # - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - # resources: - # limits: - # nvidia.com/gpu: 2 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # - name: hf-cache - # mountPath: /root/.cache/huggingface - # env: - # - name: HF_HOME - # value: /root/.cache/huggingface - # nodeSelector: - # node.kubernetes.io/instance-type: gpu-h100-sxm - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - name: hf-cache - # hostPath: - # path: /mnt/hf-cache - # type: DirectoryOrCreate + + - label: "Benchmark & Engine Test" + depends_on: upload-ready-pipeline + commands: + - | + timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=$$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=$$? + exit $$((EXIT1 | EXIT2)) + ' + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Omni Model Test" depends_on: upload-ready-pipeline diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 40244eb572..a48731934b 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -3,36 +3,18 @@ import pytest -from tests.conftest import OmniServer from tests.utils import hardware_test -models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")] +models = ["Qwen/Qwen2.5-Omni-7B"] +stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")] # Create parameter combinations for model and stage config test_params = [(model, stage_config) for model in models for stage_config in stage_configs] -@pytest.fixture(scope="module") -def omni_server(request): - """Start vLLM-Omni server as a subprocess with actual model weights. - Uses session scope so the server starts only once for the entire test session. - Multi-stage initialization can take 10-20+ minutes. - """ - model, stage_config_path = request.param - - print(f"Starting OmniServer with model: {model}") - print("This may take 10-20+ minutes for initialization...") - - with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopped") - - @pytest.mark.core_model @pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=2) +@hardware_test(res={"cuda": "L4"}, num_cards=3) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_bench_serve_chat(omni_server): command = [ diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index c4731ffc7d..e2f77af736 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -71,7 +71,7 @@ def get_system_prompt(): def get_prompt(prompt_type="text_only"): prompts = { - "text_only": "What is the capital of China?", + "text_only": "What is the capital of China? Answer in 20 words.", "mix": "What is recited in the audio? What is in this image? What is in this video?", "text_video": "What is in this video? ", "text_image": "What is in this image? ", diff --git a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml index 32b1ba15bd..59bdc9df97 100644 --- a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml +++ b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml @@ -22,6 +22,7 @@ stage_args: trust_remote_code: true engine_output_type: latent enable_prefix_caching: false + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text diff --git a/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml new file mode 100644 index 0000000000..3ebf914e88 --- /dev/null +++ b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml @@ -0,0 +1,31 @@ +stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 16384 + max_num_batched_tokens: 16384 + max_num_seqs: 1 + gpu_memory_utilization: 0.9 + skip_mm_profiling: true + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml deleted file mode 100644 index a6b4404d70..0000000000 --- a/tests/e2e/stage_configs/qwen3_omni_thinker_ci.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# The following config has been verified on 2x H100-80G GPUs. -stage_args: - - stage_id: 0 - runtime: - devices: "0,1" - max_batch_size: 5 - engine_args: - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - hf_config_name: thinker_config - tensor_parallel_size: 2 - load_format: dummy - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 100 - seed: 42 - detokenize: True - repetition_penalty: 1.05 diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py index b5f9bac991..a99c522c30 100644 --- a/tests/engine/test_async_omni_engine_abort.py +++ b/tests/engine/test_async_omni_engine_abort.py @@ -15,8 +15,8 @@ SEED = 42 -stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_thinker_ci.yaml") -model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" +stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml") +model = "Qwen/Qwen2.5-Omni-7B" async def generate( @@ -60,7 +60,7 @@ async def generate( @pytest.mark.core_model @pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.asyncio async def test_abort(): with ExitStack() as after: diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py index a4199f18e9..8e08d5bc50 100644 --- a/tests/examples/online_serving/test_qwen2_5_omni.py +++ b/tests/examples/online_serving/test_qwen2_5_omni.py @@ -236,7 +236,6 @@ def test_modality_control_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. -@pytest.mark.skip(reason="There is a known issue with stream error.") @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml index e6ed976607..3c05cffb72 100644 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml @@ -19,6 +19,7 @@ stage_args: engine_output_type: latent enable_prefix_caching: false max_num_batched_tokens: 32768 + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml index f5d87aece4..5e379aa6b7 100644 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml @@ -18,6 +18,7 @@ stage_args: trust_remote_code: true engine_output_type: latent enable_prefix_caching: false + mm_processor_cache_gb: 0 is_comprehension: true final_output: true final_output_type: text