@@ -3461,7 +3461,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
 
     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
 
-    @pytest.mark.skip(reason="https://nvbugs/5596343")
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
@@ -3535,35 +3534,66 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
             pytest.skip(
                 "https://nvbugs/5596343: Skip Hopper due to accuracy issue.")
 
-        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
-                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
 
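+        # Prompt/output length caps applied to the GPQA evaluation below.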
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)
 
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
         llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  max_batch_size=720,
                   **pytorch_config,
                   enable_attention_dp=attention_dp,
                   moe_config=MoeConfig(backend=moe_backend))
 
         with llm:
             model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
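+            # Sample with temperature/top_p of 1.0, allow the full output
+            # budget, and truncate prompts to MAX_INPUT_LEN tokens.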
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
     @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize(
         "moe_backend",
@@ -3600,7 +3630,7 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size,
                   moe_config=MoeConfig(backend=moe_backend))
 
         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/120B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -3708,30 +3738,132 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
 
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
         pytorch_config = dict(disable_overlap_scheduler=True,
                               cuda_graph_config=CudaGraphConfig())
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                         dtype=kv_cache_dtype)
 
-        model_name = "GPT-OSS/120B-MXFP4"
-        with LLM(self.MODEL_PATH,
-                 tensor_parallel_size=4,
-                 pipeline_parallel_size=1,
-                 moe_expert_parallel_size=1,
-                 kv_cache_config=kv_cache_config,
-                 max_seq_len=8192,
-                 max_num_tokens=512,
-                 enable_chunked_prefill=True,
-                 enable_attention_dp=False,
-                 moe_config=MoeConfig(backend=moe_backend),
-                 **pytorch_config) as llm:
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
-                              {"scores_filter": "exact_match,flexible-extract"})
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  max_num_tokens=512,
+                  enable_chunked_prefill=True,
+                  enable_attention_dp=False,
+                  moe_config=MoeConfig(backend=moe_backend),
+                  **pytorch_config)
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
+            task = GSM8K(model_name)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
+    def test_eagle3(self, moe_backend, mocker):
+        if moe_backend == "TRITON":
+            if not IS_TRITON_KERNELS_AVAILABLE:
+                pytest.skip("Triton kernels are not available")
+
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
+        # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
+        pytorch_config = dict(disable_overlap_scheduler=True,
+                              cuda_graph_config=CudaGraphConfig())
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        dtype="auto")
+
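+        # Two-model EAGLE3 speculative decoding with a draft length of 3.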
+        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=False)
+
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  speculative_config=spec_config,
+                  **pytorch_config,
+                  enable_attention_dp=False,
+                  moe_config=MoeConfig(backend=moe_backend))
+
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
 
 @skip_pre_hopper
 class TestEXAONE4(LlmapiAccuracyTestHarness):