Skip to content

Commit 6268a60

Browse files
authored
tests: add test_chunked_prefill for llama4 (NVIDIA#5549)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
1 parent d974198 commit 6268a60

File tree

3 files changed: 32 additions & 2 deletions

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -413,6 +413,23 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
413413
task = GSM8K(self.MODEL_NAME)
414414
task.evaluate(llm)
415415

416+
@skip_pre_blackwell
417+
@pytest.mark.skip_less_device(8)
418+
@parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
419+
def test_chunked_prefill(self, attn_backend):
420+
pytorch_config = dict(attn_backend=attn_backend,
421+
disable_overlap_scheduler=True)
422+
with LLM(self.MODEL_PATH,
423+
tensor_parallel_size=8,
424+
pipeline_parallel_size=1,
425+
moe_expert_parallel_size=1,
426+
max_seq_len=8192,
427+
enable_chunked_prefill=True,
428+
max_num_tokens=256,
429+
**pytorch_config) as llm:
430+
task = MMLU(self.MODEL_NAME)
431+
task.evaluate(llm)
432+
416433

417434
class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
418435
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -383,6 +383,8 @@ accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
383383
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
384384
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
385385
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
386+
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
387+
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
386388
accuracy/test_cli_flow.py::TestMistral7B::test_beam_search
387389
accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2
388390
accuracy/test_cli_flow.py::TestMistral7B::test_smooth_quant_tp4pp1
@@ -435,6 +437,8 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
435437
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
436438
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
437439
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
440+
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
441+
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
438442
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
439443
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
440444
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
@@ -445,13 +449,13 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
445449
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
446450
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
447451
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
448-
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
449-
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
450452
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
451453
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
452454
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
453455
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
454456
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
457+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
458+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
455459
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
456460
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
457461
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]

tests/integration/test_lists/qa/llm_sanity_test.txt

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
22
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
33
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
44
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
5+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
6+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
57
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
68
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
79
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
@@ -18,6 +20,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT
1820
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
1921
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
2022
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
23+
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
2124
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
2225
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
2326
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
@@ -35,9 +38,15 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
3538
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
3639
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
3740
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
41+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
42+
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
3843
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
3944
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
4045
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
46+
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
47+
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
48+
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
49+
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
4150
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
4251
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
4352
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized

0 commit comments

Comments
 (0)