@@ -472,7 +472,9 @@ steps:
472472 - tests/compile
473473 commands :
474474 - pytest -v -s compile/test_full_graph.py
475- - pytest -v -s compile/test_fusions_e2e.py
475+ # Only run the cases with custom ops disabled, to reduce running time.
476+ # The command is quoted so YAML reads it as a plain string; the -k expression must not start with a '-'.
477+ - " pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
476478
477479- label : Cudagraph test
478480 timeout_in_minutes : 20
@@ -929,6 +931,29 @@ steps:
929931 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
930932 # this runner has 2 GPUs available even though num_gpus=2 is not set
931933 - pytest -v -s tests/compile/test_fusion_all_reduce.py
934+ # To reduce running time, only run the allreduce & attn fusion test with Inductor partition and custom ops disabled.
935+ # The command is quoted so YAML reads it as a plain string.
936+ - " pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
937+
938+ - label : Blackwell Fusion E2E Tests # 30 min
939+ timeout_in_minutes : 40
940+ working_dir : " /vllm-workspace/"
941+ gpu : b200
942+ optional : true
943+ num_gpus : 2
944+ source_file_dependencies :
945+ - csrc/quantization/fp4/
946+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
947+ - vllm/v1/attention/backends/flashinfer.py
948+ - vllm/compilation/
949+ # can affect pattern matching
950+ - vllm/model_executor/layers/layernorm.py
951+ - vllm/model_executor/layers/activation.py
952+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
953+ - tests/compile/test_fusions_e2e.py
954+ commands :
955+ - nvidia-smi
956+ # Run all e2e fusion tests
932957 - pytest -v -s tests/compile/test_fusions_e2e.py
933958
934959- label : Blackwell GPT-OSS Eval
0 commit comments