     is_blackwell,
     run_model,
 )
-from tests.utils import cuda_device_count_stateless, flat_product
+from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention.layer import Attention
@@ -265,13 +265,13 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 HEADS = [(64, 8), (40, 8)]
 PATTERN_TEST_MODELS_FP8 = [
     (
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+        "RedHatAI/Meta-Llama-3.1-8B-FP8",
         TestAttentionFp8StaticQuantPatternModel,
     )
 ]
 PATTERN_TEST_MODELS_FP4 = [
     (
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+        "nvidia/Llama-3.1-8B-Instruct-NVFP4",
         TestAttentionNvfp4QuantPatternModel,
     )
 ]
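For reference, a minimal sketch of how these model lists could feed a pytest parameter grid via the `flat_product` helper that the import change keeps. The `flat_product` below is a stand-in re-implementation and `test_sketch` is hypothetical; the real helper lives in `tests.utils`, and the actual parametrization of `test_attention_quant_pattern` may differ.

```python
import itertools

import pytest


# Placeholder stand-ins for the pattern-model classes referenced in the diff.
class TestAttentionFp8StaticQuantPatternModel: ...


class TestAttentionNvfp4QuantPatternModel: ...


HEADS = [(64, 8), (40, 8)]
PATTERN_TEST_MODELS_FP8 = [
    ("RedHatAI/Meta-Llama-3.1-8B-FP8", TestAttentionFp8StaticQuantPatternModel)
]
PATTERN_TEST_MODELS_FP4 = [
    ("nvidia/Llama-3.1-8B-Instruct-NVFP4", TestAttentionNvfp4QuantPatternModel)
]


def flat_product(*iterables):
    """Cartesian product that flattens tuple elements into one flat tuple.

    Stand-in for tests.utils.flat_product, re-implemented for illustration.
    """
    for combo in itertools.product(*iterables):
        flat = []
        for item in combo:
            if isinstance(item, tuple):
                flat.extend(item)
            else:
                flat.append(item)
        yield tuple(flat)


@pytest.mark.parametrize(
    "num_qo_heads, num_kv_heads, model_name, model_class",
    list(flat_product(HEADS, PATTERN_TEST_MODELS_FP8 + PATTERN_TEST_MODELS_FP4)),
)
def test_sketch(num_qo_heads, num_kv_heads, model_name, model_class):
    # 4 cases total: 2 head configs x 2 (model, pattern-model-class) pairs.
    assert num_qo_heads % num_kv_heads == 0
```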
@@ -331,9 +331,8 @@ def test_attention_quant_pattern(
     if backend == AttentionBackendEnum.FLASHINFER and (
         not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
     ):
+        # This also captures the FP4 case
         pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
-    if "Llama-4-Scout" in model_name and cuda_device_count_stateless() < 2:
-        pytest.skip("Llama-4-Scout requires at least 2 GPUs")
 
     custom_ops_list = custom_ops.split(",") if custom_ops else []
 
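As a side note on the guard above: Blackwell reports compute capability 10.0, so both the FP8 and the FP4 fusion cases are skipped off-Blackwell by the same branch, which is what the new comment records. A rough stub of that gate, where `is_device_capability` and `maybe_skip_flashinfer_fusion` are illustrative stand-ins rather than the real vLLM helpers:

```python
import pytest
import torch


def is_device_capability(capability: tuple[int, int]) -> bool:
    # Stand-in for current_platform.is_device_capability: exact match against
    # the (major, minor) compute capability of the current CUDA device.
    if not torch.cuda.is_available():
        return False
    return torch.cuda.get_device_capability() == capability


def maybe_skip_flashinfer_fusion(
    is_flashinfer_backend: bool, flashinfer_installed: bool
) -> None:
    # Mirrors the skip in test_attention_quant_pattern: FlashInfer attn+quant
    # fusion (FP8 and FP4 alike) needs Blackwell (SM 10.0) plus flashinfer.
    if is_flashinfer_backend and (
        not is_device_capability((10, 0)) or not flashinfer_installed
    ):
        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
```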