@@ -571,16 +571,20 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_device(8)
     @skip_pre_blackwell
-    @parametrize_with_ids("overlap_scheduler", [False, True])
-    @parametrize_with_ids("cuda_graph", [False, True])
-    @parametrize_with_ids("attention_dp", [False, True])
-    @parametrize_with_ids("fp8kv", [False, True])
-    @parametrize_with_ids("mtp_nextn", [0, 2])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,batch_size,moe_backend",
+        [
+            (8, 1, 4, 3, False, False, True, True, 1, "CUTLASS"),
+            # TODO: enable mtp after bug fix
+            (8, 1, 4, 0, False, False, True, True, 1, "TRTLLM"),
+            (8, 1, 8, 0, True, True, True, True, 24, "CUTLASS"),
+            (8, 1, 1, 0, True, True, True, True, 24, "CUTLASS"),
+        ],
+        ids=["latency", "latency_trtllmgen", "throughput", "throughput_tp8"])
     def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
-                         attention_dp, cuda_graph, overlap_scheduler):
+                         attention_dp, cuda_graph, overlap_scheduler,
+                         batch_size, moe_backend):
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
@@ -596,14 +600,16 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         llm = LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                  batch_size=batch_size,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
                   quant_config=quant_config,
                   enable_attention_dp=attention_dp,
-                  speculative_config=mtp_config)
+                  speculative_config=mtp_config,
+                  moe_backend=moe_backend)
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
         if fp8kv:
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8