Commit d5b7f0c

[TRTLLM-8980][test] Clean up spec dec tests in test_llm_api_pytorch (NVIDIA#8889)

Signed-off-by: Mike Iovine <[email protected]>

1 parent 95049ee commit d5b7f0c

File tree

4 files changed: 28 additions & 30 deletions

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 25 additions & 30 deletions
@@ -271,9 +271,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
 
     @skip_pre_hopper
     def test_ngram(self):
+        max_bs = 16
         pytorch_config = dict(
             disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            cuda_graph_config=CudaGraphConfig(
+                batch_sizes=[i for i in range(1, max_bs + 1)]),
         )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
@@ -291,9 +294,7 @@ def test_ngram(self):
                 **pytorch_config,
                 kv_cache_config=kv_cache_config,
                 speculative_config=spec_config,
-                max_batch_size=16) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+                max_batch_size=max_bs) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -600,7 +601,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
             speculative_model_dir=eagle_model_dir,
             eagle3_one_model=eagle3_one_model)
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
+            disable_overlap_scheduler=not eagle3_one_model,
             cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
@@ -1316,6 +1317,25 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_device_memory(60000)
+    def test_bfloat16_2_model_mtp(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
+                                       mtp_eagle_one_model=False,
+                                       speculative_model_dir=self.MODEL_PATH)
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=False,
+                 max_num_tokens=8192,
+                 **pytorch_config,
+                 speculative_config=mtp_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(4)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
@@ -3439,31 +3459,6 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 3, 4, 8]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3"
-        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B"
-
-        draft_len = 1
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=True)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  max_seq_len=8192)
-
-        with llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions

@@ -458,6 +458,7 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] ISOLATION
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=fp8-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
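All three test lists register the same pytest node id for the new test. As a rough illustration, assuming a TensorRT-LLM checkout with the integration-test requirements installed and the model weights staged, it can also be invoked directly:

# Hypothetical direct run of the newly listed test node, from the repo root.
import pytest

pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_pytorch.py"
    "::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp",
    "-v",
])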
