Skip to content

Commit a736e5f

Browse files
authored
[CI] Reduce Blackwell Fusion test runtime by filtering tests and only run all tests in nightly (#28074)
1 parent 9da9208 commit a736e5f

File tree

2 files changed

+31
-8
lines changed

2 files changed

+31
-8
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -472,7 +472,9 @@ steps:
472 472
- tests/compile
473 473
commands:
474 474
- pytest -v -s compile/test_full_graph.py
475-
- pytest -v -s compile/test_fusions_e2e.py
475+
# Limit to no custom ops to reduce running time
476+
# Wrap with quotes to escape yaml and avoid starting -k string with a -
477+
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
476 478

477 479
- label: Cudagraph test
478 480
timeout_in_minutes: 20
@@ -929,6 +931,29 @@ steps:
929 931
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
930 932
# this runner has 2 GPUs available even though num_gpus=2 is not set
931 933
- pytest -v -s tests/compile/test_fusion_all_reduce.py
934+
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
935+
# Wrap with quotes to escape yaml
936+
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
937+
938+
- label: Blackwell Fusion E2E Tests # 30 min
939+
timeout_in_minutes: 40
940+
working_dir: "/vllm-workspace/"
941+
gpu: b200
942+
optional: true
943+
num_gpus: 2
944+
source_file_dependencies:
945+
- csrc/quantization/fp4/
946+
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
947+
- vllm/v1/attention/backends/flashinfer.py
948+
- vllm/compilation/
949+
# can affect pattern matching
950+
- vllm/model_executor/layers/layernorm.py
951+
- vllm/model_executor/layers/activation.py
952+
- vllm/model_executor/layers/quantization/input_quant_fp8.py
953+
- tests/compile/test_fusions_e2e.py
954+
commands:
955+
- nvidia-smi
956+
# Run all e2e fusion tests
932 957
- pytest -v -s tests/compile/test_fusions_e2e.py
933 958

934 959
- label: Blackwell GPT-OSS Eval

tests/compile/test_fusions_e2e.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -54,11 +54,11 @@ class ModelBackendTestCase(NamedTuple):
54 54

55 55
MODELS_FP4 = [
56 56
ModelBackendTestCase(
57-
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
57+
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
58 58
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
59 59
backend=_Backend.FLASHINFER,
60-
attention_fusions=48,
61-
allreduce_fusions=96,
60+
attention_fusions=32,
61+
allreduce_fusions=65,
62 62
),
63 63
]
64 64

@@ -95,8 +95,7 @@ class ModelBackendTestCase(NamedTuple):
95 95
),
96 96
]
97 97

98-
# TODO(luka) test both in nightly
99-
CUSTOM_OPS_FP8 = ["-quant_fp8"] # , "+quant_fp8"]
98+
CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
100 99

101 100

102 101
@pytest.mark.parametrize(
@@ -171,8 +170,7 @@ def test_attn_quant(
171 170
assert int(matches[0]) == attention_fusions
172 171

173 172

174-
# TODO(luka) test both in nightly
175-
CUSTOM_OPS_RMS_NORM = ["-rms_norm"] # , "+rms_norm"]
173+
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
176 174

177 175

178 176
def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:

0 commit comments

Comments
 (0)