Commit 49bcaa4

Add gpt-oss GSM8K test. (NVIDIA#6732)

Authored by Tracin
Signed-off-by: Tracin <[email protected]>
1 parent c566a8d, commit 49bcaa4

9 files changed: +107 -31 lines changed
tensorrt_llm/evaluate/interface.py
Lines changed: 2 additions & 0 deletions

@@ -33,11 +33,13 @@ class Evaluator(ABC):
     def __init__(self,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
+                 fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
         self.apply_chat_template = apply_chat_template
+        self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
 
     @abstractmethod

tensorrt_llm/evaluate/lm_eval.py
Lines changed: 33 additions & 13 deletions

@@ -133,6 +133,7 @@ def __init__(self,
                  num_samples: Optional[int] = None,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
+                 fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None):
         try:
             import lm_eval
@@ -141,8 +142,10 @@ def __init__(self,
                 f"Evaluation task {self.__class__.__name__} requires `lm_eval`. "
                 "Please install the package first, e.g., `pip install lm_eval`."
             ) from e
+        import lm_eval.tasks
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
+                         fewshot_as_multiturn=fewshot_as_multiturn,
                          system_prompt=system_prompt)
         self.task_name = task_name
         self.dataset_path = dataset_path
@@ -190,14 +193,16 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
-                 streaming: bool = False) -> float:
+                 streaming: bool = False,
+                 scores_filter: str = None) -> float:
         import lm_eval
-        results = lm_eval.evaluate(lm=LmEvalWrapper(llm, sampling_params,
-                                                    streaming),
-                                   task_dict=self.task_dict,
-                                   limit=self.num_samples,
-                                   apply_chat_template=self.apply_chat_template,
-                                   system_instruction=self.system_prompt)
+        results = lm_eval.evaluate(
+            lm=LmEvalWrapper(llm, sampling_params, streaming),
+            task_dict=self.task_dict,
+            limit=self.num_samples,
+            apply_chat_template=self.apply_chat_template,
+            fewshot_as_multiturn=self.fewshot_as_multiturn,
+            system_instruction=self.system_prompt)
         # Normalize scores to range 0~100
         scores = results["results"][self.task_name]
         for metric in scores.keys():
@@ -206,12 +211,17 @@ def evaluate(self,
         logger.info(
             f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
         )
-
-        average_acc = np.mean(
-            [acc for m, acc in scores.items() if "_stderr" not in m])
-        logger.info(
-            f"lm-eval {self.task_name} average accuracy: {average_acc:.2f}")
-        return average_acc
+        if scores_filter is not None:
+            result_acc = results["results"][self.task_name][scores_filter]
+            logger.info(
+                f"lm-eval {self.task_name} {scores_filter} accuracy: {result_acc:.2f}"
+            )
+        else:
+            result_acc = np.mean(
+                [acc for m, acc in scores.items() if "_stderr" not in m])
+            logger.info(
+                f"lm-eval {self.task_name} average accuracy: {result_acc:.2f}")
+        return result_acc
 
     @classmethod
     def command_harness(cls, ctx, **kwargs):
@@ -221,6 +231,8 @@ def command_harness(cls, ctx, **kwargs):
             random_seed=kwargs.pop("random_seed", 0),
             apply_chat_template=kwargs.pop("apply_chat_template",
                                            False),
+            fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
+                                            False),
             system_prompt=kwargs.pop("system_prompt", None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
@@ -254,6 +266,10 @@ def __init__(self, **kwargs):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option("--fewshot_as_multiturn",
+                  is_flag=True,
+                  default=False,
+                  help="Apply fewshot as multiturn.")
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -269,6 +285,10 @@ def __init__(self, **kwargs):
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
+        if kwargs.get("fewshot_as_multiturn", False):
+            assert kwargs.get(
+                "apply_chat_template", False
+            ), "apply_chat_template must be True when fewshot_as_multiturn is True"
         GSM8K.command_harness(ctx, **kwargs)
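Taken together, the lm_eval.py changes thread `fewshot_as_multiturn` from a new CLI flag down into `lm_eval.evaluate()`, and let callers select a single lm-eval metric via `scores_filter` instead of averaging all reported metrics. A minimal sketch of the resulting Python API, assuming a placeholder checkpoint path and default constructor arguments (both illustrative, not part of this commit):

    from tensorrt_llm import LLM
    from tensorrt_llm.evaluate import GSM8K

    llm = LLM("/path/to/gpt-oss-120b")  # placeholder checkpoint path
    # Multiturn few-shot requires the chat template, mirroring the new
    # assertion in command() on the CLI path.
    task = GSM8K(apply_chat_template=True, fewshot_as_multiturn=True)
    # With scores_filter set, evaluate() returns that one entry from the
    # lm-eval results table; scores_filter=None keeps the old behavior of
    # averaging all non-stderr metrics.
    accuracy = task.evaluate(llm, scores_filter="exact_match,flexible-extract")

On the CLI, the equivalent pairing is `--apply_chat_template --fewshot_as_multiturn`.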

tests/integration/defs/accuracy/accuracy_core.py
Lines changed: 7 additions & 1 deletion

@@ -192,7 +192,11 @@ def evaluate(self,
             evaluator_kwargs.update(extra_evaluator_kwargs)
         evaluator = self.EVALUATOR_CLS(num_samples=num_samples,
                                        **evaluator_kwargs)
-        accuracy = evaluator.evaluate(llm, sampling_params, streaming)
+        evaluate_kwargs = {}
+        if hasattr(self, 'EVALUATE_KWARGS'):
+            evaluate_kwargs.update(self.EVALUATE_KWARGS)
+        accuracy = evaluator.evaluate(llm, sampling_params, streaming,
+                                      **evaluate_kwargs)
         if self.HIGHER_IS_BETTER:
             assert accuracy >= threshold, f"Expected accuracy >= {threshold}, but got {accuracy}."
         else:
@@ -298,6 +302,8 @@ class GSM8K(AccuracyTask):
     EVALUATOR_CLS = tensorrt_llm.evaluate.GSM8K
     EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR, random_seed=0)
 
+    EVALUATE_KWARGS = dict(scores_filter=None)
+
 
 class GPQADiamond(AccuracyTask):
     DATASET = "gpqa_diamond"

tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 4 additions & 4 deletions

@@ -159,12 +159,12 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
 GPT-OSS/BF16:
-  - accuracy: 88.5
+  - accuracy: 90.3
 GPT-OSS/MXFP4:
-  - accuracy: 88.5
+  - accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
-    accuracy: 88.5
+    accuracy: 90.3
   - quant_algo: W4A8_MXFP4_FP8
-    accuracy: 88.5
+    accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 14 additions & 13 deletions

@@ -2463,10 +2463,14 @@ def test_auto_dtype_long_rope(self):
 class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
-    def get_gpt_oss_root(self):
-        gpt_oss_root = os.getenv("GPT_OSS_MODELS_ROOT")
-        assert gpt_oss_root, "GPT_OSS_MODELS_ROOT needs to be set as parent of checkpoints."
-        return gpt_oss_root
+    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
+
+    def update_task_kwargs(self, task):
+        task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True
+        task.EVALUATOR_KWARGS["apply_chat_template"] = True
+        task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract"
+        task.MAX_OUTPUT_LEN = 8192
+        return task
 
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"],
                              ids=["cutlass", "trtllm", "triton"])
@@ -2481,7 +2485,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -2491,9 +2495,8 @@
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
@@ -2519,7 +2522,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
@@ -2530,9 +2533,8 @@
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
@@ -2551,7 +2553,7 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_openai_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
@@ -2561,9 +2563,8 @@
                   moe_backend="TRITON")
         with llm:
             model_name = "GPT-OSS/BF16"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
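Three behavioral notes on this file: checkpoints are now resolved under the shared `llm_models_root()` tree rather than a dedicated `GPT_OSS_MODELS_ROOT` environment variable; the MMLU pass was dropped so each test evaluates GSM8K only; and every GSM8K run goes through `update_task_kwargs`, which turns on chat-template multiturn few-shot, scores on the `exact_match,flexible-extract` metric, and raises `MAX_OUTPUT_LEN` to 8192. To reproduce one variant locally, an invocation such as `pytest accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu -k triton` from `tests/integration/defs` should work, though the exact pytest usage is an assumption rather than something this commit specifies.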

tests/integration/test_lists/qa/llm_function_full.txt
Lines changed: 13 additions & 0 deletions

@@ -519,6 +519,19 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/test-db/l0_b200.yml
Lines changed: 3 additions & 0 deletions

@@ -54,6 +54,9 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

tests/integration/test_lists/test-db/l0_dgx_b200.yml
Lines changed: 10 additions & 0 deletions

@@ -69,3 +69,13 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]

tests/integration/test_lists/test-db/l0_dgx_h100.yml
Lines changed: 21 additions & 0 deletions

@@ -121,6 +121,27 @@ l0_dgx_h100:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: pytorch
+    auto_trigger: gpt_oss
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 - condition:
     ranges:
       system_gpu_count:
