Skip to content

Commit 6c3210a

Browse files
authored
[test] add nvfp4 DeepSeek-V3-Lite-mtp tests (NVIDIA#5125)
Signed-off-by: Fanrong Li <[email protected]>
1 parent 6a388b1 commit 6c3210a

File tree

10 files changed

+108
-44
lines changed

10 files changed

+108
-44
lines changed

tests/integration/defs/.test_durations

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,8 @@
289289
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 220.57452515885234,
290290
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 202.22269394202158,
291291
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 165.08514453098178,
292-
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 252.70569713797886,
293-
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 85.24235329206567,
292+
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 252.70569713797886,
293+
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 85.24235329206567,
294294
"test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]": 81.43792725296225,
295295
"test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]": 109.26379436196294,
296296
"test_e2e.py::test_ptp_quickstart_advanced_mixed_precision": 80.88908524392173,
@@ -338,9 +338,9 @@
338338
"accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant_ootb_manage_weights": 216.66169160604477,
339339
"accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_manage_weights[int4]": 161.57166086137295,
340340
"test_unittests.py::test_unittests_v2[unittest/llmapi/test_llm.py -m \"not part0\"]": 1883.5484512336552,
341-
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 89.92349556891713,
342-
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler-torch_compile=False]": 175.661773331929,
343-
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 90.21807348495349,
341+
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 89.92349556891713,
342+
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]": 175.661773331929,
343+
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 90.21807348495349,
344344
"accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4": 56.31924073398113,
345345
"test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]": 56.05445321695879,
346346
"test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]": 114.17938271397725,

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ deepseek-ai/DeepSeek-V3-Lite:
2323
- quant_algo: NVFP4
2424
kv_cache_quant_algo: FP8
2525
accuracy: 63.71
26+
- quant_algo: NVFP4
27+
spec_dec_algo: MTP
28+
accuracy: 63.71
29+
- quant_algo: NVFP4
30+
kv_cache_quant_algo: FP8
31+
spec_dec_algo: MTP
32+
accuracy: 63.71
2633
- quant_algo: FP8_BLOCK_SCALES
2734
accuracy: 64.74
2835
- quant_algo: FP8_BLOCK_SCALES

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ deepseek-ai/DeepSeek-V3-Lite:
115115
- accuracy: 71.40
116116
- quant_algo: NVFP4
117117
accuracy: 70.60
118+
- quant_algo: NVFP4
119+
spec_dec_algo: MTP
120+
accuracy: 70.60
118121
- quant_algo: FP8_BLOCK_SCALES
119122
accuracy: 71.27
120123
- spec_dec_algo: MTP

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -798,8 +798,12 @@ def test_fp8_block_scales_4gpus_static_eplb(self):
798798
(False, False, True, False),
799799
(False, False, False, True),
800800
(True, False, True, True), (True, True, True, True)])
801+
@parametrize_with_ids("mtp_nextn", [0, 2])
802+
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
801803
def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
802-
torch_compile):
804+
torch_compile, mtp_nextn, moe_backend):
805+
if torch_compile and mtp_nextn > 0:
806+
pytest.skip("https://nvbugs/5252313")
803807
if torch_compile and attention_dp:
804808
pytest.skip("https://nvbugs/5252559")
805809
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
@@ -810,18 +814,24 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
810814
disable_overlap_scheduler=not overlap_scheduler,
811815
use_cuda_graph=cuda_graph,
812816
torch_compile_config=torch_compile_config,
817+
moe_backend=moe_backend,
813818
)
819+
mtp_config = None
820+
if mtp_nextn > 0:
821+
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
822+
814823
quant_config = QuantConfig()
815824
quant_config.quant_algo = QuantAlgo.NVFP4
816825
if fp8kv:
817826
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
818827
pytorch_config["kv_cache_dtype"] = "fp8"
819828

820-
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
829+
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
821830
kv_cache_config=kv_cache_config,
822831
**pytorch_config,
823832
quant_config=quant_config,
824-
enable_attention_dp=attention_dp)
833+
enable_attention_dp=attention_dp,
834+
speculative_config=mtp_config)
825835

826836
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
827837
if fp8kv:
@@ -850,9 +860,13 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
850860
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
851861
(2, 2, 1), (1, 4, 1)],
852862
ids=["tp4", "ep4", "tp2pp2", "pp4"])
863+
@parametrize_with_ids("mtp_nextn", [0, 2])
864+
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
853865
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
854866
overlap_scheduler, tp_size, pp_size, ep_size,
855-
torch_compile):
867+
torch_compile, mtp_nextn, moe_backend):
868+
if torch_compile and mtp_nextn > 0:
869+
pytest.skip("https://nvbugs/5252313")
856870
if torch_compile and attention_dp:
857871
pytest.skip("https://nvbugs/5252559")
858872
if torch_compile and pp_size > 1:
@@ -867,22 +881,28 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
867881
disable_overlap_scheduler=not overlap_scheduler,
868882
use_cuda_graph=cuda_graph,
869883
torch_compile_config=torch_compile_config,
884+
moe_backend=moe_backend,
870885
)
871886

887+
mtp_config = None
888+
if mtp_nextn > 0:
889+
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
890+
872891
quant_config = QuantConfig()
873892
quant_config.quant_algo = QuantAlgo.NVFP4
874893
if fp8kv:
875894
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
876895
pytorch_config["kv_cache_dtype"] = "fp8"
877896

878-
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
897+
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
879898
tensor_parallel_size=tp_size,
880899
pipeline_parallel_size=pp_size,
881900
moe_expert_parallel_size=ep_size,
882901
kv_cache_config=kv_cache_config,
883902
**pytorch_config,
884903
quant_config=quant_config,
885-
enable_attention_dp=attention_dp)
904+
enable_attention_dp=attention_dp,
905+
speculative_config=mtp_config)
886906

887907
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
888908
if fp8kv:

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,8 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
459459
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
460460
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
461461
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
462-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
462+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
463+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
463464
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
464465
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
465466
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2

tests/integration/test_lists/qa/llm_release_rtx_pro_6000.txt

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,16 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-
1111
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
1212
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
1313
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
14-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
15-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
16-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
17-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
18-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
19-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
14+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
15+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
16+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
17+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
18+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
19+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
20+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
21+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
22+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
23+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
2024
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
2125
test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
2226
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]

tests/integration/test_lists/qa/llm_sanity_test.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-c
128128
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
129129
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
130130
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
131-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
131+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
132+
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
132133
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
133134
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
134135
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,23 @@ l0_b200:
2929
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
3030
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
3131
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
32-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
33-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
34-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
35-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
36-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
37-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
38-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
39-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
40-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
32+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
33+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
34+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
35+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
36+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
37+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
38+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
39+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
40+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
41+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
42+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
43+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
44+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
45+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
46+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
47+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
48+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
4149
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
4250
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
4351
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]

0 commit comments

Comments
 (0)