[CI] Reduce Blackwell Fusion test runtime by filtering tests and only run all tests in nightly (#28074)

Copilot · web-flow · commit a736e5ff770b · 2025-11-07T15:58:16.000+08:00
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -472,7 +472,9 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+    # Limit to TRITON cases with no custom ops (-quant_fp8) to reduce running time
+    # Wrap with quotes to escape yaml; keep the -k expression from starting with a '-'
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -929,6 +931,29 @@ steps:
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
     # this runner has 2 GPUs available even though num_gpus=2 is not set
     - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, Llama-3.1, no custom ops, and the allreduce & attn fusion test to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
     - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
@@ -54,11 +54,11 @@ class ModelBackendTestCase(NamedTuple):
 
     MODELS_FP4 = [
         ModelBackendTestCase(
-            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             backend=_Backend.FLASHINFER,
-            attention_fusions=48,
-            allreduce_fusions=96,
+            attention_fusions=32,
+            allreduce_fusions=65,
         ),
     ]
 
@@ -95,8 +95,7 @@ class ModelBackendTestCase(NamedTuple):
         ),
     ]
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_FP8 = ["-quant_fp8"]  # , "+quant_fp8"]
+CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
 
 
 @pytest.mark.parametrize(
@@ -171,8 +170,7 @@ def test_attn_quant(
     assert int(matches[0]) == attention_fusions
 
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_RMS_NORM = ["-rms_norm"]  # , "+rms_norm"]
+CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
 
 
 def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: