Skip to content

Commit 3b279a8

Browse files
authored
[CI] Add Blackwell DeepSeek FP8 FlashInfer MoE tests (vllm-project#26040)
Signed-off-by: mgoin <[email protected]>
1 parent 5e4a822 commit 3b279a8

File tree

1 file changed

+29
-11
lines changed

1 file changed

+29
-11
lines changed

tests/quantization/test_blackwell_moe.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import json
55
import os
6+
from typing import Optional
67

78
import pytest
89

@@ -20,9 +21,10 @@
2021
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
2122

2223

23-
def can_initialize(model: str, extra_args: list[str]):
24+
def can_initialize(model: str, extra_args: Optional[list[str]] = None):
2425

2526
# Server arguments
27+
extra_args = extra_args if extra_args is not None else []
2628
server_args = [
2729
"--max-model-len",
2830
"2048",
@@ -65,68 +67,84 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
6567
monkeypatch: pytest.MonkeyPatch):
6668
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
6769
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
68-
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
70+
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
6971

7072

7173
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 MoE through FlashInfer's TRTLLM (latency) backend."""
    # Route the MoE layers through FlashInfer's FP8 path, selecting the
    # latency-optimized kernel backend before the server starts.
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "latency",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
7779

7880

7981
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 MoE through FlashInfer's CUTLASS backend."""
    # NVFP4 checkpoint routed through the throughput-optimized CUTLASS path.
    for key, value in (("VLLM_USE_FLASHINFER_MOE_FP4", "1"),
                       ("VLLM_FLASHINFER_MOE_BACKEND", "throughput")):
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
8486

8587

8688
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 MoE through FlashInfer's TRTLLM (latency) backend."""
    nvfp4_model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
    # Select FlashInfer's FP4 MoE path with the latency-optimized kernels.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(nvfp4_model)
9193

9294

9395
## DeepSeekV3 ##
9496

9597

9698
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block-quantized MoE through the DeepGEMM kernels."""
    deepseek = "deepseek-ai/DeepSeek-V3.1"
    # Force the DeepGEMM backend for the block-scaled FP8 grouped GEMMs.
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize(deepseek)
101+
102+
103+
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
                          "Expected failure: assert self.block_quant is None"))
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE through FlashInfer's CUTLASS backend."""
    # Select FlashInfer's FP8 MoE path with the throughput-optimized kernels.
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "throughput",
    }
    for key, value in flashinfer_env.items():
        monkeypatch.setenv(key, value)
    can_initialize("deepseek-ai/DeepSeek-V3.1")
110+
111+
112+
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE through FlashInfer's TRTLLM backend."""
    deepseek = "deepseek-ai/DeepSeek-V3.1"
    # FlashInfer FP8 MoE path, latency-optimized kernel selection.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(deepseek)
99117

100118

101119
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE through FlashInfer's CUTLASS backend."""
    # FP4 checkpoint routed through the throughput-optimized CUTLASS path.
    for key, value in (("VLLM_USE_FLASHINFER_MOE_FP4", "1"),
                       ("VLLM_FLASHINFER_MOE_BACKEND", "throughput")):
        monkeypatch.setenv(key, value)
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
106124

107125

108126
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE through FlashInfer's TRTLLM (latency) backend."""
    fp4_model = "nvidia/DeepSeek-R1-0528-FP4-v2"
    # FlashInfer FP4 MoE path, latency-optimized kernel selection.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(fp4_model)
113131

114132

115133
## GPT-OSS ##
116134

117135

118136
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4-weight / BF16-activation MoE through FlashInfer."""
    gptoss = "openai/gpt-oss-20b"
    # Enable FlashInfer's MXFP4xBF16 MoE kernels for this run only.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize(gptoss)
121139

122140

123141
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4-weight / MXFP8-activation MoE via FlashInfer CUTLASS."""
    gptoss = "openai/gpt-oss-20b"
    # Enable the CUTLASS variant of the MXFP4xMXFP8 MoE kernels.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize(gptoss)
127145

128146

129147
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4-weight / MXFP8-activation MoE via FlashInfer TRTLLM."""
    gptoss = "openai/gpt-oss-20b"
    # Enable the TRTLLM variant of the MXFP4xMXFP8 MoE kernels.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize(gptoss)

0 commit comments

Comments
 (0)