
Commit c1cfb61 (parent: 50c2b82)

[TRTLLM-9381][feat] Add kimi k2 fp4 tests (#9906)

Signed-off-by: Xin He (SW-GPU) <[email protected]>

8 files changed (+46, -6 lines)

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 4 additions & 0 deletions
@@ -146,6 +146,10 @@ Qwen3/Qwen3-Next-80B-A3B-Instruct:
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 94.84
+moonshotai/Kimi-K2-Thinking:
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 90.84
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
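Each entry in these reference files keys a model name to a list of quantization configurations and their expected scores; the accuracy harness selects the entry whose quant_algo (and, when present, kv_cache_quant_algo) matches the running LLM. A minimal sketch of that lookup, using a hypothetical lookup_baseline helper rather than the harness's real loader:

import yaml  # requires PyYAML


def lookup_baseline(path, model, quant_algo=None, kv_cache_quant_algo=None):
    # Return the reference accuracy for a model/quantization combination.
    with open(path) as f:
        references = yaml.safe_load(f)
    for entry in references.get(model, []):
        # Entries without a quant key (plain bf16 baselines) match None.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(
        f"no reference for {model} ({quant_algo}/{kv_cache_quant_algo})")


# The entry added above would resolve as:
# lookup_baseline("gsm8k.yaml", "moonshotai/Kimi-K2-Thinking",
#                 quant_algo="NVFP4", kv_cache_quant_algo="FP8")  # -> 90.84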

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 0 deletions
@@ -250,6 +250,10 @@ Qwen3/Qwen3-Next-80B-A3B-Instruct:
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 87.65
+moonshotai/Kimi-K2-Thinking:
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.83
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
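The MMLU references follow the same schema as the GSM8K file, so the lookup sketch above applies unchanged; only the baselines differ (85.83 here versus 90.84 on GSM8K for the new NVFP4 Kimi-K2-Thinking entry).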

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 29 additions & 0 deletions
@@ -2999,6 +2999,35 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, fp8kv,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.timeout(7200)
+    @pytest.mark.skip_less_device_memory(120000)
+    @pytest.mark.parametrize("tp_size", [
+        pytest.param(4, marks=pytest.mark.skip_less_device(4)),
+        pytest.param(8, marks=pytest.mark.skip_less_device(8)),
+    ],
+                             ids=["4gpus", "8gpus"])
+    def test_nvfp4(self, tp_size):
+        model_name = "moonshotai/Kimi-K2-Thinking"
+        model_path = f"{llm_models_root()}/Kimi-K2-Thinking-NVFP4"
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+
+        with LLM(model_path,
+                 tensor_parallel_size=tp_size,
+                 max_batch_size=16,
+                 pipeline_parallel_size=1,
+                 moe_expert_parallel_size=1,
+                 kv_cache_config=kv_cache_config,
+                 enable_attention_dp=True,
+                 trust_remote_code=True,
+                 speculative_config=None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = MMLU(model_name)
+            task.evaluate(llm)
+            task = GSM8K(model_name)
+            task.evaluate(llm)
+
 
 class TestMinitron4BBaseInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-Mini-4B-Instruct"
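The new test first asserts the checkpoint really is NVFP4-quantized, then scores it on MMLU and GSM8K against the reference entries above. To select just these cases locally, an illustrative invocation (assuming the integration-test environment is set up and the Kimi-K2-Thinking-NVFP4 checkpoint is available under llm_models_root()):

# Illustrative only: select the new cases by pytest node ID; the
# "4gpus"/"8gpus" suffixes come from the parametrize ids added above.
import pytest

pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_pytorch.py"
    "::TestKimiK2::test_nvfp4",
])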

tests/integration/defs/test_e2e.py

Lines changed: 4 additions & 4 deletions
@@ -3155,13 +3155,12 @@ def test_ptp_quickstart_advanced_llama_multi_nodes(llm_root, llm_venv,
     pytest.param('Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf',
                  marks=skip_pre_blackwell),
     pytest.param('DeepSeek-R1/DeepSeek-R1-0528-FP4', marks=skip_pre_blackwell),
-    pytest.param('Kimi-K2-Instruct',
-                 marks=(skip_pre_hopper, skip_post_blackwell)),
+    pytest.param('Kimi-K2-Thinking-NVFP4', marks=skip_pre_blackwell),
     pytest.param('nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1',
                  marks=skip_pre_hopper),
 ])
-def test_multi_nodes_eval(llm_venv, model_path, tp_size, pp_size, ep_size,
-                          eval_task, mmlu_dataset_root):
+def test_multi_nodes_eval(model_path, tp_size, pp_size, ep_size, eval_task,
+                          mmlu_dataset_root):
     if "Llama-4" in model_path and tp_size == 16:
         pytest.skip("Llama-4 with tp16 is not supported")
 
@@ -3176,6 +3175,7 @@ def test_multi_nodes_eval(llm_venv, model_path, tp_size, pp_size, ep_size,
         f"--pp_size={pp_size}",
         f"--kv_cache_free_gpu_memory_fraction={_MEM_FRACTION_80}",
         "--max_batch_size=32",
+        "--enable_attention_dp",
         "--backend=pytorch",
     ]
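The added --enable_attention_dp flag mirrors the enable_attention_dp=True argument in the new accuracy test above: broadly, attention runs data-parallel over requests (each rank serves its own batch slice) while the expert layers stay sharded, a configuration commonly used for large MLA-based MoE checkpoints such as Kimi-K2 and DeepSeek-V3. The exact semantics are defined by the quickstart script this test drives.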

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 2 additions & 0 deletions
@@ -629,6 +629,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_tr
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 2 additions & 0 deletions
@@ -120,6 +120,8 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]

tests/integration/test_lists/qa/llm_function_multinode.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ test_e2e.py::test_multi_nodes_eval[llama4-models/nvidia/Llama-4-Maverick-17B-128
 test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu]
 test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu]
 test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu]
-test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu]
+test_e2e.py::test_multi_nodes_eval[Kimi-K2-Thinking-NVFP4-tp16-mmlu]
 test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu]
 test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1]
 test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -268,7 +268,6 @@ examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_t
 examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507)
 cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689)
 cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
-test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu] SKIP (https://nvbugs/5556998)
 full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
 full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
 full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5574553)
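With the Kimi-K2-Instruct multi-node case replaced in test_e2e.py above, its SKIP waiver for https://nvbugs/5556998 is removed here rather than left pointing at a test ID that no longer exists.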
