
Commit d3ea943

[CI] Add pytest markers to current tests and update the doc. (#577)
Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent cdde401 commit d3ea943

37 files changed: +166 additions, -95 deletions

.buildkite/pipeline.yml

Lines changed: 2 additions & 14 deletions

@@ -20,19 +20,7 @@ steps:
   - label: "Simple Unit Test"
     depends_on: image-build
     commands:
-      - |
-        pytest -v -s \
-          tests/entrypoints/ \
-          tests/diffusion/cache/ \
-          tests/diffusion/lora/ \
-          tests/model_executor/models/qwen2_5_omni/test_audio_length.py \
-          tests/worker/ \
-          tests/distributed/omni_connectors/test_kv_flow.py \
-          --cov=vllm_omni \
-          --cov-branch \
-          --cov-report=term-missing \
-          --cov-report=html \
-          --cov-report=xml
+      - "pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
     agents:
       queue: "gpu_1_queue"
     plugins:
@@ -118,7 +106,7 @@ steps:
     timeout_in_minutes: 15
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+      - pytest -s -v -m 'core_model and cache and diffusion and not distributed_cuda and L4'
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
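The pipeline above swaps hard-coded test paths for pytest `-m` marker expressions. As a rough illustration of how such an expression filters a test suite — this is a small self-contained sketch, not pytest internals, and the test names and marker sets below are hypothetical:

```python
import re

def matches(expr: str, markers: set[str]) -> bool:
    """Evaluate a pytest-style '-m' expression against one test's marker names."""
    # Map each marker identifier in the expression to True/False by membership,
    # then evaluate the remaining boolean expression (only and/or/not survive).
    names = {t for t in re.findall(r"\w+", expr) if t not in ("and", "or", "not")}
    env = {name: (name in markers) for name in names}
    return bool(eval(expr, {"__builtins__": {}}, env))

# Hypothetical tests tagged the way this commit tags them.
tests = {
    "test_cache_backends": {"core_model", "cpu", "cache"},
    "test_bench_serve_chat": {"core_model", "benchmark", "cuda", "H100", "distributed_cuda"},
}

# The 'Simple Unit Test' filter keeps only CPU-runnable core tests.
selected = [name for name, m in tests.items() if matches("core_model and cpu", m)]
```

With these hypothetical tags, `selected` contains only `test_cache_backends`, which is exactly why the GPU benchmark no longer needs to be excluded by path.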

.buildkite/test-amd.yaml

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ steps:
       - export GPU_ARCHS=gfx942
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+      - pytest -s -v -m 'core_model and cache and diffusion and not distributed_rocm and MI325'

   - label: "Diffusion Sequence Parallelism Test"
     timeout_in_minutes: 20

docs/contributing/ci/tests_markers.md

Lines changed: 28 additions & 32 deletions

@@ -5,33 +5,33 @@ By adding markers before test functions, tests can later be executed uniformly b
 ## Current Markers
 Defined in `pyproject.toml`:

-| Marker             | Description                                             |
-| ------------------ | ------------------------------------------------------- |
-| `core_model`       | Core model tests (run in each PR)                       |
-| `diffusion`        | Diffusion model tests                                   |
-| `omni`             | Omni model tests                                        |
-| `cache`            | Cache backend tests                                     |
-| `parallel`         | Parallelism/distributed tests                           |
-| `cpu`              | Tests that run on CPU                                   |
-| `gpu`              | Tests that run on GPU (auto-added)                      |
-| `cuda`             | Tests that run on CUDA (auto-added)                     |
-| `rocm`             | Tests that run on AMD/ROCm (auto-added)                 |
-| `npu`              | Tests that run on NPU/Ascend (auto-added)               |
-| `H100`             | Tests that require H100 GPU                             |
-| `L4`               | Tests that require L4 GPU                               |
-| `MI325`            | Tests that require MI325 GPU (AMD/ROCm)                 |
-| `A2`               | Tests that require A2 NPU                               |
-| `A3`               | Tests that require A3 NPU                               |
-| `distributed_cuda` | Tests that require multi cards on CUDA platform         |
-| `distributed_rocm` | Tests that require multi cards on ROCm platform         |
-| `distributed_npu`  | Tests that require multi cards on NPU platform          |
-| `skipif_cuda`      | Skip if the num of CUDA cards is less than the required |
-| `skipif_rocm`      | Skip if the num of ROCm cards is less than the required |
-| `skipif_npu`       | Skip if the num of NPU cards is less than the required  |
-| `slow`             | Slow tests (may skip in quick CI)                       |
-| `benchmark`        | Benchmark tests                                         |
-
-For those markers shown as auto-added, they will be added by the `@hardware_test` decorator.
+| Marker             | Description                                               |
+| ------------------ | --------------------------------------------------------- |
+| `core_model`       | Core model tests (run in each PR)                         |
+| `diffusion`        | Diffusion model tests                                     |
+| `omni`             | Omni model tests                                          |
+| `cache`            | Cache backend tests                                       |
+| `parallel`         | Parallelism/distributed tests                             |
+| `cpu`              | Tests that run on CPU                                     |
+| `gpu`              | Tests that run on GPU *                                   |
+| `cuda`             | Tests that run on CUDA *                                  |
+| `rocm`             | Tests that run on AMD/ROCm *                              |
+| `npu`              | Tests that run on NPU/Ascend *                            |
+| `H100`             | Tests that require H100 GPU *                             |
+| `L4`               | Tests that require L4 GPU *                               |
+| `MI325`            | Tests that require MI325 GPU (AMD/ROCm) *                 |
+| `A2`               | Tests that require A2 NPU *                               |
+| `A3`               | Tests that require A3 NPU *                               |
+| `distributed_cuda` | Tests that require multi cards on CUDA platform *         |
+| `distributed_rocm` | Tests that require multi cards on ROCm platform *         |
+| `distributed_npu`  | Tests that require multi cards on NPU platform *          |
+| `skipif_cuda`      | Skip if the num of CUDA cards is less than the required * |
+| `skipif_rocm`      | Skip if the num of ROCm cards is less than the required * |
+| `skipif_npu`       | Skip if the num of NPU cards is less than the required *  |
+| `slow`             | Slow tests (may skip in quick CI)                         |
+| `benchmark`        | Benchmark tests                                           |
+
+\* Means those markers are auto-added, and they will be added by the `@hardware_test` decorator.

 ### Example usage for markers

@@ -71,10 +71,7 @@ This decorator is intended to make hardware-aware, cross-platform test authoring
 Support for `skipif_rocm` and `skipif_npu` will be implemented later.

-5. **Runs each test in a new process**
-   Automatically wraps the distributed test with a decorator (`@create_new_process_for_each_test`) to ensure isolation and compatibility with multi-process hardware backends.
-
-6. **Works with pytest filtering**
+5. **Works with pytest filtering**
    Allows tests to be filtered and selected at runtime using standard pytest marker expressions (e.g., `-m "distributed_cuda and L4"`).

 #### Example usage for decorator

@@ -94,7 +91,6 @@ This decorator is intended to make hardware-aware, cross-platform test authoring
 ```
 - `res` must be a dict; supported resources: CUDA (L4/H100), ROCm (MI325), NPU (A2/A3)
 - `num_cards` can be int (all platforms) or dict (per platform); defaults to 1 when missing
-- `hardware_test` automatically applies `@create_new_process_for_each_test` for distributed tests.
 - Distributed markers (`distributed_cuda`, `distributed_rocm`, `distributed_npu`) are auto-added for multi-card cases
 - Filtering examples:
   - CUDA only: `pytest -m "distributed_cuda and L4"`
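The documented auto-marking behavior of `@hardware_test` — adding platform and card markers from `res`, and a `distributed_*` marker when `num_cards` exceeds 1 — can be sketched as below. This is an illustrative guess at the mechanics, not the actual implementation in `tests/utils`; it relies only on the standard `pytest.mark` API:

```python
import pytest

def hardware_test(res=None, num_cards=1):
    """Sketch of a marker-adding decorator (assumed behavior, not the real one)."""
    def wrap(fn):
        for platform, card in (res or {}).items():
            # Auto-add the platform marker (e.g. `cuda`) and the card marker (e.g. `L4`).
            fn = getattr(pytest.mark, platform)(fn)
            fn = getattr(pytest.mark, card)(fn)
            # num_cards may be an int (all platforms) or a per-platform dict,
            # defaulting to 1 when a platform is missing from the dict.
            cards = num_cards if isinstance(num_cards, int) else num_cards.get(platform, 1)
            if cards > 1:
                fn = getattr(pytest.mark, f"distributed_{platform}")(fn)
        return fn
    return wrap

@hardware_test(res={"cuda": "L4"}, num_cards=2)
def test_example():
    pass

# Marks applied to a function land on its `pytestmark` attribute.
marker_names = {m.name for m in test_example.pytestmark}
```

With this sketch, `marker_names` ends up as `{"cuda", "L4", "distributed_cuda"}`, which matches the filtering example `pytest -m "distributed_cuda and L4"` given above.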

pyproject.toml

Lines changed: 4 additions & 0 deletions

@@ -175,6 +175,10 @@ markers = [
     "slow: Slow tests (may skip in quick CI)",
     "benchmark: Benchmark tests",
 ]
+filterwarnings = [
+    "ignore:.*does not have '__test__' attribute.*:UserWarning",
+    "ignore:.*does not have '__bases__' attribute.*:UserWarning",
+]

 [tool.typos.default]
 extend-ignore-identifiers-re = [
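The two new `filterwarnings` entries use pytest's `action:message:category` syntax, where the message part is a regex. Their effect corresponds roughly to the stdlib calls below; the warning text is a made-up example, only the regex comes from the config:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Equivalent of "ignore:.*does not have '__test__' attribute.*:UserWarning"
    warnings.filterwarnings(
        "ignore",
        message=r".*does not have '__test__' attribute.*",
        category=UserWarning,
    )
    warnings.warn("Foo does not have '__test__' attribute, skipping", UserWarning)  # suppressed
    warnings.warn("unrelated warning", UserWarning)                                 # recorded

remaining = [str(w.message) for w in caught]
```

Only the unrelated warning survives, which is the point of the config change: collection-time `UserWarning` noise is silenced without hiding other warnings.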

tests/benchmarks/test_serve_cli.py

Lines changed: 4 additions & 0 deletions

@@ -4,6 +4,7 @@
 import pytest

 from tests.conftest import OmniServer
+from tests.utils import hardware_test

 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
@@ -29,6 +30,9 @@ def omni_server(request):
     print("OmniServer stopped")


+@pytest.mark.core_model
+@pytest.mark.benchmark
+@hardware_test(res={"cuda": "H100"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_bench_serve_chat(omni_server):
     command = [

tests/diffusion/cache/test_cache_backends.py

Lines changed: 2 additions & 0 deletions

@@ -22,6 +22,8 @@
 from vllm_omni.diffusion.cache.teacache.backend import TeaCacheBackend
 from vllm_omni.diffusion.data import DiffusionCacheConfig

+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+

 class TestCacheDiTBackend:
     """Test CacheDiTBackend implementation."""
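This file, and several others in the commit, use a module-level `pytestmark` list, which pytest applies to every test collected from the module — equivalent to decorating each test individually. A minimal self-contained sketch of the pattern (the test body here is illustrative, not from the repo):

```python
import pytest

# Applied by pytest to every test function and class in this module,
# exactly like the pytestmark lists added in this commit.
pytestmark = [pytest.mark.core_model, pytest.mark.cpu]

def test_something():
    # Any test in this file is now selected by `-m "core_model and cpu"`.
    assert 1 + 1 == 2
```

This is why the commit can delete repeated `@pytest.mark.cache` / `@hardware_test` decorators in files like `test_kv_flow.py`: one module-level list covers all tests in the file.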

tests/diffusion/lora/test_base_linear.py

Lines changed: 3 additions & 0 deletions

@@ -5,10 +5,13 @@

 from dataclasses import dataclass

+import pytest
 import torch

 from vllm_omni.diffusion.lora.layers.base_linear import DiffusionBaseLinearLayerWithLoRA

+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+

 @dataclass
 class _DummyLoRAConfig:

tests/diffusion/lora/test_lora_manager.py

Lines changed: 3 additions & 0 deletions

@@ -3,6 +3,7 @@

 from __future__ import annotations

+import pytest
 import torch
 from vllm.lora.lora_weights import LoRALayerWeights
 from vllm.lora.utils import get_supported_lora_modules
@@ -11,6 +12,8 @@
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
 from vllm_omni.lora.request import LoRARequest

+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+

 class _DummyLoRALayer:
     def __init__(self, n_slices: int, output_slices: tuple[int, ...]):

tests/diffusion/test_diffusion_worker.py

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,8 @@

 from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker

+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+

 @pytest.fixture
 def mock_od_config():

tests/distributed/omni_connectors/test_kv_flow.py

Lines changed: 4 additions & 22 deletions

@@ -1,14 +1,15 @@
 import pytest
 import torch

-from tests.utils import hardware_test
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.distributed.omni_connectors.kv_transfer_manager import (
     OmniKVCacheConfig,
     OmniKVTransferManager,
 )
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams

+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.cache]
+

 class MockConnector:
     def __init__(self):
@@ -58,11 +59,6 @@ def common_constants():
     }


-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_extraction(kv_config, mock_connector, common_constants):
     """Test extraction and sending logic in OmniKVTransferManager."""
     num_layers = common_constants["num_layers"]
@@ -109,11 +105,6 @@ def test_manager_extraction(kv_config, mock_connector, common_constants):
     assert data["layer_blocks"]["key_cache"][0].shape == expected_shape


-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_reception(kv_config, mock_connector, common_constants):
     """Test reception and injection logic in OmniKVTransferManager."""
     num_layers = common_constants["num_layers"]
@@ -171,11 +162,6 @@ def test_manager_reception(kv_config, mock_connector, common_constants):
     assert req.kv_metadata["seq_len"] == seq_len


-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_integration_flow(common_constants):
     """Simulate extraction -> connector -> reception."""
     num_layers = common_constants["num_layers"]
@@ -211,7 +197,8 @@ def test_integration_flow(common_constants):
         recv_timeout=1.0,
     )
     receiver_manager = OmniKVTransferManager(receiver_config)
-    receiver_manager._connector = connector  # Share the same mock connector instance
+    # Share the same mock connector instance
+    receiver_manager._connector = connector

     req = OmniDiffusionRequest(
         prompts=["test_integ"],
@@ -228,11 +215,6 @@ def test_integration_flow(common_constants):
     assert req.kv_metadata["seq_len"] == 10


-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_extraction_no_connector(kv_config, common_constants):
     """Test extraction when connector is unavailable (should still return IDs)."""
     block_size = common_constants["block_size"]
