Skip to content

Commit e5376bd

Browse files
committed
add tests
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent 5efd9bd commit e5376bd

File tree

4 files changed

+12
-3
lines changed

4 files changed

+12
-3
lines changed

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def pad_and_multiply(scale, tensor):
9090
return ref
9191

9292

93-
def cute_dsl_nvfp4_group_gemm_ref(
93+
def cute_dsl_nvfp4_grouped_gemm_ref(
9494
a: torch.Tensor,
9595
b: torch.Tensor,
9696
a_sf: torch.Tensor,

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,13 +1674,15 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
16741674
(False, False, False, True),
16751675
(True, False, True, True), (True, True, True, True)])
16761676
@parametrize_with_ids("mtp_nextn", [0, 2])
1677-
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
1677+
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"])
16781678
def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
16791679
torch_compile, mtp_nextn, moe_backend):
16801680
if moe_backend == "TRTLLM" and (get_sm_version() == 120
16811681
or get_sm_version() == 121):
16821682
pytest.skip(
16831683
"MOE TRTLLM backend does not support SM version 120 or 121")
1684+
if moe_backend == "CUTEDSL" and get_sm_version() != 100:
1685+
pytest.skip(f"{moe_backend} backend supports SM 100 only")
16841686

16851687
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
16861688
torch_compile_config = TorchCompileConfig(
@@ -1767,7 +1769,7 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
17671769
(2, 2, 1), (1, 4, 1)],
17681770
ids=["tp4", "ep4", "tp2pp2", "pp4"])
17691771
@parametrize_with_ids("mtp_nextn", [0, 2])
1770-
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
1772+
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"])
17711773
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
17721774
overlap_scheduler, tp_size, pp_size, ep_size,
17731775
torch_compile, mtp_nextn, moe_backend):
@@ -1777,6 +1779,9 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
17771779
or get_sm_version() == 121):
17781780
pytest.skip(
17791781
"MOE TRTLLM backend does not support SM version 120 or 121")
1782+
if moe_backend == "CUTEDSL" and get_sm_version() != 100:
1783+
pytest.skip(f"{moe_backend} backend supports SM 100 only")
1784+
17801785
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
17811786
# Piecewise Cuda Graph cannot be enabled for nvfp4 attention dp.
17821787
torch_compile_config = TorchCompileConfig(

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ l0_b200:
2727
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
2828
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] ISOLATION
2929
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
30+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
3031
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
3132
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
3233
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
@@ -146,4 +147,5 @@ l0_b200:
146147
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
147148
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
148149
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
150+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
149151
- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ l0_dgx_b200:
3838
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
3939
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
4040
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
41+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
4142
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
4243
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
4344
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
@@ -167,6 +168,7 @@ l0_dgx_b200:
167168
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
168169
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
169170
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
171+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
170172
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0]
171173
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
172174
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False]

0 commit comments

Comments
 (0)