     is_blackwell,
     run_model,
 )
-from tests.utils import cuda_device_count_stateless, flat_product
+from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention.layer import Attention
@@ -265,13 +265,13 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 HEADS = [(64, 8), (40, 8)]
 PATTERN_TEST_MODELS_FP8 = [
     (
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+        "RedHatAI/Meta-Llama-3.1-8B-FP8",
         TestAttentionFp8StaticQuantPatternModel,
     )
 ]
 PATTERN_TEST_MODELS_FP4 = [
     (
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+        "nvidia/Llama-3.1-8B-Instruct-NVFP4",
         TestAttentionNvfp4QuantPatternModel,
     )
 ]
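For reference, a minimal sketch of how these model lists could feed a pytest parameter grid via the `flat_product` helper that the import change keeps. The `flat_product` below is a stand-in re-implementation and `test_sketch` is hypothetical; the real helper lives in `tests.utils`, and the actual parametrization of `test_attention_quant_pattern` may differ.

```python
import itertools

import pytest


# Placeholder stand-ins for the pattern-model classes referenced in the diff.
class TestAttentionFp8StaticQuantPatternModel: ...


class TestAttentionNvfp4QuantPatternModel: ...


HEADS = [(64, 8), (40, 8)]
PATTERN_TEST_MODELS_FP8 = [
    ("RedHatAI/Meta-Llama-3.1-8B-FP8", TestAttentionFp8StaticQuantPatternModel)
]
PATTERN_TEST_MODELS_FP4 = [
    ("nvidia/Llama-3.1-8B-Instruct-NVFP4", TestAttentionNvfp4QuantPatternModel)
]


def flat_product(*iterables):
    """Cartesian product that flattens tuple elements into one flat tuple.

    Stand-in for tests.utils.flat_product, re-implemented for illustration.
    """
    for combo in itertools.product(*iterables):
        flat = []
        for item in combo:
            if isinstance(item, tuple):
                flat.extend(item)
            else:
                flat.append(item)
        yield tuple(flat)


@pytest.mark.parametrize(
    "num_qo_heads, num_kv_heads, model_name, model_class",
    list(flat_product(HEADS, PATTERN_TEST_MODELS_FP8 + PATTERN_TEST_MODELS_FP4)),
)
def test_sketch(num_qo_heads, num_kv_heads, model_name, model_class):
    # 4 cases total: 2 head configs x 2 (model, pattern-model-class) pairs.
    assert num_qo_heads % num_kv_heads == 0
```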
@@ -331,9 +331,8 @@ def test_attention_quant_pattern(
     if backend == AttentionBackendEnum.FLASHINFER and (
         not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
     ):
+        # This also captures the FP4 case
         pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
-    if "Llama-4-Scout" in model_name and cuda_device_count_stateless() < 2:
-        pytest.skip("Llama-4-Scout requires at least 2 GPUs")
 
     custom_ops_list = custom_ops.split(",") if custom_ops else []
 
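As a side note on the guard above: Blackwell reports compute capability 10.0, so both the FP8 and the FP4 fusion cases are skipped off-Blackwell by the same branch, which is what the new comment records. A rough stub of that gate, where `is_device_capability` and `maybe_skip_flashinfer_fusion` are illustrative stand-ins rather than the real vLLM helpers:

```python
import pytest
import torch


def is_device_capability(capability: tuple[int, int]) -> bool:
    # Stand-in for current_platform.is_device_capability: exact match against
    # the (major, minor) compute capability of the current CUDA device.
    if not torch.cuda.is_available():
        return False
    return torch.cuda.get_device_capability() == capability


def maybe_skip_flashinfer_fusion(
    is_flashinfer_backend: bool, flashinfer_installed: bool
) -> None:
    # Mirrors the skip in test_attention_quant_pattern: FlashInfer attn+quant
    # fusion (FP8 and FP4 alike) needs Blackwell (SM 10.0) plus flashinfer.
    if is_flashinfer_backend and (
        not is_device_capability((10, 0)) or not flashinfer_installed
    ):
        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
```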