@@ -271,9 +271,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
 
     @skip_pre_hopper
     def test_ngram(self):
+        max_bs = 16
+
         pytorch_config = dict(
             disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            cuda_graph_config=CudaGraphConfig(
+                batch_sizes=[i for i in range(1, max_bs + 1)]),
         )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
@@ -291,9 +294,7 @@ def test_ngram(self):
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
                  speculative_config=spec_config,
-                 max_batch_size=16) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+                 max_batch_size=max_bs) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -600,7 +601,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
+            disable_overlap_scheduler=not eagle3_one_model,
             cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
@@ -1316,6 +1317,25 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_device_memory(60000)
+    def test_bfloat16_2_model_mtp(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
+                                       mtp_eagle_one_model=False,
+                                       speculative_model_dir=self.MODEL_PATH)
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=False,
+                 max_num_tokens=8192,
+                 **pytorch_config,
+                 speculative_config=mtp_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(4)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
@@ -3439,31 +3459,6 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 3, 4, 8]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3"
-        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B"
-
-        draft_len = 1
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=True)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  max_seq_len=8192)
-
-        with llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [