Commit 6d6797c

[None][test] Enhance GPT-OSS CI with GPQA Diamond and additional Spec Decoding Test (#8661)
Signed-off-by: Dongfeng Yu <[email protected]>
Signed-off-by: dongfengy <[email protected]>
1 parent f877823 commit 6d6797c

File tree: 8 files changed, +196 −21 lines


tests/integration/defs/accuracy/references/gpqa_diamond.yaml

Lines changed: 20 additions & 0 deletions
@@ -59,3 +59,23 @@ nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 57.07
+GPT-OSS/120B-MXFP4:
+  - accuracy: 65.0
+  - spec_dec_algo: Eagle
+    accuracy: 65.0
+  - quant_algo: W4A8_MXFP4_MXFP8
+    accuracy: 65.0
+  - quant_algo: W4A8_MXFP4_MXFP8
+    spec_dec_algo: Eagle
+    accuracy: 65.0
+  - quant_algo: W4A8_MXFP4_MXFP8
+    kv_cache_quant_algo: FP8
+    accuracy: 65.0
+  - quant_algo: W4A16_MXFP4
+    accuracy: 65.0
+  - quant_algo: W4A16_MXFP4
+    spec_dec_algo: Eagle
+    accuracy: 65.0
+  - quant_algo: W4A16_MXFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 65.0
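
Each new entry keys the expected GPQA Diamond score on the combination of quant_algo, kv_cache_quant_algo, and spec_dec_algo that a test runs with, so one reference file covers every CI variant. As a rough sketch of how such a table can be consumed — the lookup_reference helper and its exact-match rule are illustrative assumptions, not the harness's actual code:

# Hypothetical sketch: resolve a run configuration to its reference score.
# The real matching logic lives in the accuracy harness and may differ.
import yaml

def lookup_reference(path, model, quant_algo=None,
                     kv_cache_quant_algo=None, spec_dec_algo=None):
    """Return the expected accuracy for the first matching entry."""
    with open(path) as f:
        entries = yaml.safe_load(f)[model]
    wanted = {
        "quant_algo": quant_algo,
        "kv_cache_quant_algo": kv_cache_quant_algo,
        "spec_dec_algo": spec_dec_algo,
    }
    for entry in entries:
        # A key omitted from an entry must also be unset in the run.
        if all(entry.get(k) == v for k, v in wanted.items()):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with {wanted}")

# lookup_reference("gpqa_diamond.yaml", "GPT-OSS/120B-MXFP4",
#                  quant_algo="W4A16_MXFP4", spec_dec_algo="Eagle")  # -> 65.0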

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 8 additions & 0 deletions
@@ -219,7 +219,12 @@ GPT-OSS/BF16:
     accuracy: 90.3
 GPT-OSS/120B-MXFP4:
   - accuracy: 90.3
+  - spec_dec_algo: Eagle
+    accuracy: 90.3
+  - quant_algo: W4A8_MXFP4_MXFP8
+    accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
+    spec_dec_algo: Eagle
     accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
     kv_cache_quant_algo: FP8
@@ -231,6 +236,9 @@ GPT-OSS/120B-MXFP4:
     accuracy: 90.3
   - quant_algo: W4A16_MXFP4
     accuracy: 90.3
+  - quant_algo: W4A16_MXFP4
+    spec_dec_algo: Eagle
+    accuracy: 90.3
   - quant_algo: W4A16_MXFP4
     kv_cache_quant_algo: FP8
     accuracy: 90.3
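
Every Eagle variant above reuses the baseline 90.3: two-model speculative decoding verifies draft tokens against the target model, so it is expected to leave accuracy unchanged, and a shared reference value doubles as a regression check on the speculation path.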

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 153 additions & 21 deletions
@@ -3461,7 +3461,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
 
     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
 
-    @pytest.mark.skip(reason="https://nvbugs/5596343")
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
@@ -3535,35 +3534,66 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
             pytest.skip(
                 "https://nvbugs/5596343: Skip Hopper due to accuracy issue.")

-        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
-                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")

+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
         llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  max_batch_size=720,
                   **pytorch_config,
                   enable_attention_dp=attention_dp,
                   moe_config=MoeConfig(backend=moe_backend))

         with llm:
             model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)

+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
     @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize(
         "moe_backend",
@@ -3600,7 +3630,7 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size,
                   moe_config=MoeConfig(backend=moe_backend))

         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/120B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -3708,30 +3738,132 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")

+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
         pytorch_config = dict(disable_overlap_scheduler=True,
                               cuda_graph_config=CudaGraphConfig())
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                         dtype=kv_cache_dtype)

-        model_name = "GPT-OSS/120B-MXFP4"
-        with LLM(self.MODEL_PATH,
-                 tensor_parallel_size=4,
-                 pipeline_parallel_size=1,
-                 moe_expert_parallel_size=1,
-                 kv_cache_config=kv_cache_config,
-                 max_seq_len=8192,
-                 max_num_tokens=512,
-                 enable_chunked_prefill=True,
-                 enable_attention_dp=False,
-                 moe_config=MoeConfig(backend=moe_backend),
-                 **pytorch_config) as llm:
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
-                              {"scores_filter": "exact_match,flexible-extract"})
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  max_num_tokens=512,
+                  enable_chunked_prefill=True,
+                  enable_attention_dp=False,
+                  moe_config=MoeConfig(backend=moe_backend),
+                  **pytorch_config)
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
+            task = GSM8K(model_name)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
+    def test_eagle3(self, moe_backend, mocker):
+        if moe_backend == "TRITON":
+            if not IS_TRITON_KERNELS_AVAILABLE:
+                pytest.skip("Triton kernels are not available")
+
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
+        # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
+        pytorch_config = dict(disable_overlap_scheduler=True,
+                              cuda_graph_config=CudaGraphConfig())
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        dtype="auto")
+
+        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=False)
+
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  speculative_config=spec_config,
+                  **pytorch_config,
+                  enable_attention_dp=False,
+                  moe_config=MoeConfig(backend=moe_backend))
+
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)

+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+

 @skip_pre_hopper
 class TestEXAONE4(LlmapiAccuracyTestHarness):
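
All three tests now size the engine as max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN = 32768 + 128179 = 160947 tokens, enough for a full GPQA prompt plus the medium-reasoning output budget. Stripped of the pytest scaffolding, the new test_eagle3 configuration reduces to a short LLM-API script; the following sketch mirrors the settings in the diff, with shortened model paths (the test resolves them via llm_models_root()) and a prompt invented for illustration:

# Minimal standalone sketch of the two-model EAGLE3 setup from test_eagle3.
# Model paths are placeholders; the prompt is illustrative only.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig

spec_config = EagleDecodingConfig(
    max_draft_len=3,                      # draft_len in the test
    speculative_model_dir="gpt-oss-120b-Eagle3",
    eagle3_one_model=False)               # 2-model speculation

llm = LLM("gpt-oss-120b",
          tensor_parallel_size=4,
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        dtype="auto"),
          speculative_config=spec_config,
          # https://nvbugs/5590408: disable overlap scheduling for 2-model
          disable_overlap_scheduler=True)

with llm:
    outputs = llm.generate(
        ["Question: what is 12 * 17? Answer:"],
        SamplingParams(temperature=1.0, top_p=1.0, max_tokens=64))
    print(outputs[0].outputs[0].text)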

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 3 additions & 0 deletions
@@ -558,6 +558,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 3 additions & 0 deletions
@@ -97,6 +97,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 3 additions & 0 deletions
@@ -157,6 +157,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 4 additions & 0 deletions
@@ -51,6 +51,8 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
@@ -190,4 +192,6 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 2 additions & 0 deletions
@@ -184,6 +184,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
 - condition:
     ranges:
       system_gpu_count:
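
Note the backend split between the two lanes: the TRTLLM MoE backend is marked skip_pre_blackwell in the new test, so l0_dgx_b200 picks up the trtllm and cutlass variants while l0_dgx_h100 runs cutlass and triton.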
