@@ -2999,6 +2999,35 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, fp8kv,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.timeout(7200)
+    @pytest.mark.skip_less_device_memory(120000)
+    @pytest.mark.parametrize("tp_size", [
+        pytest.param(4, marks=pytest.mark.skip_less_device(4)),
+        pytest.param(8, marks=pytest.mark.skip_less_device(8)),
+    ],
+                             ids=["4gpus", "8gpus"])
+    def test_nvfp4(self, tp_size):
+        model_name = "moonshotai/Kimi-K2-Thinking"
+        model_path = f"{llm_models_root()}/Kimi-K2-Thinking-NVFP4"
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+
+        with LLM(model_path,
+                 tensor_parallel_size=tp_size,
+                 max_batch_size=16,
+                 pipeline_parallel_size=1,
+                 moe_expert_parallel_size=1,
+                 kv_cache_config=kv_cache_config,
+                 enable_attention_dp=True,
+                 trust_remote_code=True,
+                 speculative_config=None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = MMLU(model_name)
+            task.evaluate(llm)
+            task = GSM8K(model_name)
+            task.evaluate(llm)
+
 
 class TestMinitron4BBaseInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-Mini-4B-Instruct"
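Note on the GPU-gating markers used above: `skip_less_device` and `skip_less_device_memory` are custom markers resolved by the suite's conftest, not built-in pytest marks. As a rough, hypothetical sketch of the pattern (the actual TensorRT-LLM conftest implementation may differ), a `pytest_collection_modifyitems` hook can translate the marker into a runtime skip:

```python
# Hypothetical conftest.py sketch: resolving a custom skip_less_device marker
# at collection time. The real TensorRT-LLM conftest may differ; this only
# illustrates the mechanism the parametrization above relies on.
import pytest
import torch


def pytest_collection_modifyitems(config, items):
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    for item in items:
        marker = item.get_closest_marker("skip_less_device")
        if marker is None:
            continue
        required = marker.args[0]
        if n_gpus < required:
            item.add_marker(
                pytest.mark.skip(
                    reason=f"requires {required} GPUs, found {n_gpus}"))
```

With a hook like this in place, the `4gpus` and `8gpus` parametrizations are skipped automatically on under-provisioned nodes instead of failing, and `skip_less_device_memory` can follow the same pattern by comparing its threshold against `torch.cuda.get_device_properties(0).total_memory`.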