diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 16c8184c6ee..0cc34f61e85 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -263,9 +263,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
 
     @skip_pre_hopper
     def test_ngram(self):
+        max_bs = 16
+
         pytorch_config = dict(
             disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            cuda_graph_config=CudaGraphConfig(
+                batch_sizes=[i for i in range(1, max_bs + 1)]),
         )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
@@ -283,9 +286,7 @@ def test_ngram(self):
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
                  speculative_config=spec_config,
-                 max_batch_size=16) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+                 max_batch_size=max_bs) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -592,7 +593,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
             speculative_model_dir=eagle_model_dir,
             eagle3_one_model=eagle3_one_model)
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
+            disable_overlap_scheduler=not eagle3_one_model,
             cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
@@ -1210,6 +1211,25 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_device_memory(60000)
+    def test_bfloat16_2_model_mtp(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
+                                       mtp_eagle_one_model=False,
+                                       speculative_model_dir=self.MODEL_PATH)
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=False,
+                 max_num_tokens=8192,
+                 **pytorch_config,
+                 speculative_config=mtp_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(4)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
@@ -1854,14 +1874,19 @@ def test_chunked_prefill(self, quant_dtype, kv_cache_reuse, fp8kv,
 
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
+    @parametrize_with_ids("use_one_model", [False, True])
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
-    def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
+    def test_guided_decoding(self, backend: str, mtp_nextn: int,
+                             use_one_model: bool, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         cuda_graph_config = CudaGraphConfig(enable_padding=True)
         mtp_config = None
         if mtp_nextn > 0:
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+            mtp_config = MTPDecodingConfig(
+                num_nextn_predict_layers=mtp_nextn,
+                mtp_eagle_one_model=use_one_model,
+                speculative_model_dir=self.MODEL_PATH)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
@@ -3036,31 +3061,6 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 3, 4, 8]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3"
-        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B"
-
-        draft_len = 1
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=True)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  max_seq_len=8192)
-
-        with llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
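Note on the new test_bfloat16_2_model_mtp case above: it exercises MTP speculative decoding in two-model mode (mtp_eagle_one_model=False), where the MTP draft layers run as a separate draft model instead of being fused into the target. A minimal standalone sketch of the same configuration follows; the checkpoint path and prompt are placeholders, and only the config fields are taken from the test:

# Sketch only: mirrors the config in test_bfloat16_2_model_mtp above.
# The checkpoint path and prompt are hypothetical placeholders.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                 MTPDecodingConfig)

model_path = "/models/DeepSeek-V3-Lite/bf16"  # placeholder path

mtp_config = MTPDecodingConfig(
    num_nextn_predict_layers=3,        # draft up to 3 tokens per step
    mtp_eagle_one_model=False,         # two-model path: separate draft engine
    speculative_model_dir=model_path)  # MTP weights live in the target checkpoint

with LLM(model_path,
         kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.75),
         enable_chunked_prefill=False,
         max_num_tokens=8192,
         disable_overlap_scheduler=True,  # the test disables overlap for this path
         cuda_graph_config=CudaGraphConfig(),  # default CUDA-graph batch sizes
         speculative_config=mtp_config) as llm:
    print(llm.generate("The capital of France is").outputs[0].text)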
f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3" - target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B" - - draft_len = 1 - spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, - eagle3_one_model=True) - - llm = LLM(model=target_model_dir, - **pytorch_config, - kv_cache_config=kv_cache_config, - speculative_config=spec_config, - max_seq_len=8192) - - with llm: - task = GSM8K(self.MODEL_NAME) - task.evaluate(llm) - @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 460f1da4930..71f1610282c 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -458,6 +458,7 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] @@ -467,10 +468,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=0] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=2] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-use_one_model=True-mtp_nextn=0] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-use_one_model=True-mtp_nextn=2] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=False-mtp_nextn=2] 
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 114717be909..f6810956fb2 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -22,6 +22,7 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
@@ -33,7 +34,8 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=False-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=2]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 3b7d94d38c3..f5f4a69c84d 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -54,6 +54,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=fp8-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
@@ -242,7 +243,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=0]
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype TIMEOUT (90)
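For completeness, the guided-decoding hunk in test_llm_api_pytorch.py combines a structured-output backend with MTP in either one-model or two-model form. Below is a hedged sketch of the use_one_model=False variant through the LLM API; the checkpoint path, prompt, and JSON schema are illustrative, while the config fields mirror the test:

# Sketch only: guided decoding (xgrammar) plus two-model MTP, mirroring
# test_guided_decoding[xgrammar-use_one_model=False-mtp_nextn=2].
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, GuidedDecodingParams,
                                 KvCacheConfig, MTPDecodingConfig,
                                 SamplingParams)

model_path = "/models/DeepSeek-V3-Lite/bf16"  # placeholder path

mtp_config = MTPDecodingConfig(num_nextn_predict_layers=2,
                               mtp_eagle_one_model=False,
                               speculative_model_dir=model_path)

with LLM(model_path,
         guided_decoding_backend="xgrammar",
         kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.75),
         cuda_graph_config=CudaGraphConfig(enable_padding=True),
         speculative_config=mtp_config) as llm:
    schema = '{"type": "object", "properties": {"answer": {"type": "string"}}}'
    out = llm.generate(
        "Reply with a JSON object containing one key, 'answer'.",
        SamplingParams(max_tokens=64,
                       guided_decoding=GuidedDecodingParams(json=schema)))
    print(out.outputs[0].text)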