|
3 | 3 |
|
4 | 4 | import json
|
5 | 5 | import os
|
| 6 | +from typing import Optional |
6 | 7 |
|
7 | 8 | import pytest
|
8 | 9 |
|
|
# Shrink the model at init time so these smoke tests stay small/fast.
# Both key spellings are provided to cover either HF config naming scheme.
dummy_hf_overrides = dict(num_layers=4, num_hidden_layers=4)
23 |
| -def can_initialize(model: str, extra_args: list[str]): |
| 24 | +def can_initialize(model: str, extra_args: Optional[list[str]] = None): |
24 | 25 |
|
25 | 26 | # Server arguments
|
| 27 | + extra_args = extra_args if extra_args is not None else [] |
26 | 28 | server_args = [
|
27 | 29 | "--max-model-len",
|
28 | 30 | "2048",
|
@@ -65,68 +67,84 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
|
65 | 67 | monkeypatch: pytest.MonkeyPatch):
|
66 | 68 | monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
67 | 69 | monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
68 |
| - can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", []) |
| 70 | + can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8") |
69 | 71 |
|
70 | 72 |
|
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 with FlashInfer MoE, latency (TRT-LLM) backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "latency",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
77 | 79 |
|
78 | 80 |
|
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with FlashInfer MoE, throughput backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP4": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "throughput",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
84 | 86 |
|
85 | 87 |
|
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with FlashInfer MoE, latency (TRT-LLM) backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP4": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "latency",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
91 | 93 |
|
92 | 94 |
|
93 | 95 | ## DeepSeekV3 ##
|
94 | 96 |
|
95 | 97 |
|
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block-quantized MoE with VLLM_USE_DEEP_GEMM on."""
    model = "deepseek-ai/DeepSeek-V3.1"
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize(model)
| 101 | + |
| 102 | + |
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
                          "Expected failure: assert self.block_quant is None"))
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with FlashInfer, throughput backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "throughput",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("deepseek-ai/DeepSeek-V3.1")
| 110 | + |
| 111 | + |
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with FlashInfer, latency backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "latency",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("deepseek-ai/DeepSeek-V3.1")
99 | 117 |
|
100 | 118 |
|
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 with FlashInfer MoE, throughput backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP4": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "throughput",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
106 | 124 |
|
107 | 125 |
|
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 with FlashInfer MoE, latency (TRT-LLM) backend."""
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP4": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "latency",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
113 | 131 |
|
114 | 132 |
|
115 | 133 | ## GPT-OSS ##
|
116 | 134 |
|
117 | 135 |
|
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS 20B with VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 enabled."""
    model = "openai/gpt-oss-20b"
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize(model)
121 | 139 |
|
122 | 140 |
|
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS 20B with VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS enabled."""
    model = "openai/gpt-oss-20b"
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize(model)
127 | 145 |
|
128 | 146 |
|
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS 20B with VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 enabled."""
    model = "openai/gpt-oss-20b"
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize(model)
0 commit comments