Commit 49bcaa4

Add gpt-oss GSM8K test. (NVIDIA#6732)

Authored by Tracin
Signed-off-by: Tracin <[email protected]>
1 parent c566a8d, commit 49bcaa4

9 files changed: +107 -31 lines changed
tensorrt_llm/evaluate/interface.py
Lines changed: 2 additions & 0 deletions

@@ -33,11 +33,13 @@ class Evaluator(ABC):
     def __init__(self,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
+                 fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
         self.apply_chat_template = apply_chat_template
+        self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
 
     @abstractmethod

tensorrt_llm/evaluate/lm_eval.py
Lines changed: 33 additions & 13 deletions

@@ -133,6 +133,7 @@ def __init__(self,
                  num_samples: Optional[int] = None,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
+                 fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None):
         try:
             import lm_eval
@@ -141,8 +142,10 @@ def __init__(self,
                 f"Evaluation task {self.__class__.__name__} requires `lm_eval`. "
                 "Please install the package first, e.g., `pip install lm_eval`."
             ) from e
+        import lm_eval.tasks
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
+                         fewshot_as_multiturn=fewshot_as_multiturn,
                          system_prompt=system_prompt)
         self.task_name = task_name
         self.dataset_path = dataset_path
@@ -190,14 +193,16 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
-                 streaming: bool = False) -> float:
+                 streaming: bool = False,
+                 scores_filter: str = None) -> float:
         import lm_eval
-        results = lm_eval.evaluate(lm=LmEvalWrapper(llm, sampling_params,
-                                                    streaming),
-                                   task_dict=self.task_dict,
-                                   limit=self.num_samples,
-                                   apply_chat_template=self.apply_chat_template,
-                                   system_instruction=self.system_prompt)
+        results = lm_eval.evaluate(
+            lm=LmEvalWrapper(llm, sampling_params, streaming),
+            task_dict=self.task_dict,
+            limit=self.num_samples,
+            apply_chat_template=self.apply_chat_template,
+            fewshot_as_multiturn=self.fewshot_as_multiturn,
+            system_instruction=self.system_prompt)
         # Normalize scores to range 0~100
         scores = results["results"][self.task_name]
         for metric in scores.keys():
@@ -206,12 +211,17 @@ def evaluate(self,
         logger.info(
             f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
         )
-
-        average_acc = np.mean(
-            [acc for m, acc in scores.items() if "_stderr" not in m])
-        logger.info(
-            f"lm-eval {self.task_name} average accuracy: {average_acc:.2f}")
-        return average_acc
+        if scores_filter is not None:
+            result_acc = results["results"][self.task_name][scores_filter]
+            logger.info(
+                f"lm-eval {self.task_name} {scores_filter} accuracy: {result_acc:.2f}"
+            )
+        else:
+            result_acc = np.mean(
+                [acc for m, acc in scores.items() if "_stderr" not in m])
+            logger.info(
+                f"lm-eval {self.task_name} average accuracy: {result_acc:.2f}")
+        return result_acc
 
     @classmethod
     def command_harness(cls, ctx, **kwargs):
@@ -221,6 +231,8 @@ def command_harness(cls, ctx, **kwargs):
             random_seed=kwargs.pop("random_seed", 0),
             apply_chat_template=kwargs.pop("apply_chat_template",
                                            False),
+            fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
+                                            False),
             system_prompt=kwargs.pop("system_prompt", None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
@@ -254,6 +266,10 @@ def __init__(self, **kwargs):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option("--fewshot_as_multiturn",
+                  is_flag=True,
+                  default=False,
+                  help="Apply fewshot as multiturn.")
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -269,6 +285,10 @@ def __init__(self, **kwargs):
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
+        if kwargs.get("fewshot_as_multiturn", False):
+            assert kwargs.get(
+                "apply_chat_template", False
+            ), "apply_chat_template must be True when fewshot_as_multiturn is True"
         GSM8K.command_harness(ctx, **kwargs)
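Taken together, the lm_eval.py changes thread `fewshot_as_multiturn` from a new CLI flag down into `lm_eval.evaluate()`, and let callers select a single lm-eval metric via `scores_filter` instead of averaging all reported metrics. A minimal sketch of the resulting Python API, assuming a placeholder checkpoint path and default constructor arguments (both illustrative, not part of this commit):

    from tensorrt_llm import LLM
    from tensorrt_llm.evaluate import GSM8K

    llm = LLM("/path/to/gpt-oss-120b")  # placeholder checkpoint path
    # Multiturn few-shot requires the chat template, mirroring the new
    # assertion in command() on the CLI path.
    task = GSM8K(apply_chat_template=True, fewshot_as_multiturn=True)
    # With scores_filter set, evaluate() returns that one entry from the
    # lm-eval results table; scores_filter=None keeps the old behavior of
    # averaging all non-stderr metrics.
    accuracy = task.evaluate(llm, scores_filter="exact_match,flexible-extract")

On the CLI, the equivalent pairing is `--apply_chat_template --fewshot_as_multiturn`.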

tests/integration/defs/accuracy/accuracy_core.py
Lines changed: 7 additions & 1 deletion

@@ -192,7 +192,11 @@ def evaluate(self,
             evaluator_kwargs.update(extra_evaluator_kwargs)
         evaluator = self.EVALUATOR_CLS(num_samples=num_samples,
                                        **evaluator_kwargs)
-        accuracy = evaluator.evaluate(llm, sampling_params, streaming)
+        evaluate_kwargs = {}
+        if hasattr(self, 'EVALUATE_KWARGS'):
+            evaluate_kwargs.update(self.EVALUATE_KWARGS)
+        accuracy = evaluator.evaluate(llm, sampling_params, streaming,
+                                      **evaluate_kwargs)
         if self.HIGHER_IS_BETTER:
             assert accuracy >= threshold, f"Expected accuracy >= {threshold}, but got {accuracy}."
         else:
@@ -298,6 +302,8 @@ class GSM8K(AccuracyTask):
     EVALUATOR_CLS = tensorrt_llm.evaluate.GSM8K
     EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR, random_seed=0)
 
+    EVALUATE_KWARGS = dict(scores_filter=None)
+
 
 class GPQADiamond(AccuracyTask):
     DATASET = "gpqa_diamond"

tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 4 additions & 4 deletions

@@ -159,12 +159,12 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
 GPT-OSS/BF16:
-  - accuracy: 88.5
+  - accuracy: 90.3
 GPT-OSS/MXFP4:
-  - accuracy: 88.5
+  - accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
-    accuracy: 88.5
+    accuracy: 90.3
   - quant_algo: W4A8_MXFP4_FP8
-    accuracy: 88.5
+    accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 14 additions & 13 deletions

@@ -2463,10 +2463,14 @@ def test_auto_dtype_long_rope(self):
 class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
-    def get_gpt_oss_root(self):
-        gpt_oss_root = os.getenv("GPT_OSS_MODELS_ROOT")
-        assert gpt_oss_root, "GPT_OSS_MODELS_ROOT needs to be set as parent of checkpoints."
-        return gpt_oss_root
+    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
+
+    def update_task_kwargs(self, task):
+        task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True
+        task.EVALUATOR_KWARGS["apply_chat_template"] = True
+        task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract"
+        task.MAX_OUTPUT_LEN = 8192
+        return task
 
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"],
                              ids=["cutlass", "trtllm", "triton"])
@@ -2481,7 +2485,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -2491,9 +2495,8 @@
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
@@ -2519,7 +2522,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
@@ -2530,9 +2533,8 @@
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
@@ -2551,7 +2553,7 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(f"{self.get_openai_root()}/gpt-oss-120b",
+        llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
@@ -2561,9 +2563,8 @@
                   moe_backend="TRITON")
         with llm:
             model_name = "GPT-OSS/BF16"
-            task = MMLU(model_name)
-            task.evaluate(llm)
             task = GSM8K(model_name)
+            task = self.update_task_kwargs(task)
             task.evaluate(llm)
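Three behavioral notes on this file: checkpoints are now resolved under the shared `llm_models_root()` tree rather than a dedicated `GPT_OSS_MODELS_ROOT` environment variable; the MMLU pass was dropped so each test evaluates GSM8K only; and every GSM8K run goes through `update_task_kwargs`, which turns on chat-template multiturn few-shot, scores on the `exact_match,flexible-extract` metric, and raises `MAX_OUTPUT_LEN` to 8192. To reproduce one variant locally, an invocation such as `pytest accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu -k triton` from `tests/integration/defs` should work, though the exact pytest usage is an assumption rather than something this commit specifies.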

tests/integration/test_lists/qa/llm_function_full.txt
Lines changed: 13 additions & 0 deletions

@@ -519,6 +519,19 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/test-db/l0_b200.yml
Lines changed: 3 additions & 0 deletions

@@ -54,6 +54,9 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

tests/integration/test_lists/test-db/l0_dgx_b200.yml
Lines changed: 10 additions & 0 deletions

@@ -69,3 +69,13 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]

tests/integration/test_lists/test-db/l0_dgx_h100.yml
Lines changed: 21 additions & 0 deletions

@@ -121,6 +121,27 @@ l0_dgx_h100:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: pytorch
+    auto_trigger: gpt_oss
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 - condition:
     ranges:
       system_gpu_count:
