
Commit 7a2bab9

[None][test] Add post merge test for Seed-OSS-36B-Instruct (NVIDIA#8321)
Signed-off-by: Zhen Huang <[email protected]>
1 parent e72ade3 commit 7a2bab9

File tree: 9 files changed, +109 −15 lines


jenkins/L0_Test.groovy

Lines changed: 2 additions & 1 deletion
@@ -2503,7 +2503,8 @@ def launchTestJobs(pipeline, testFilter)
         // "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
         "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
+        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         // "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
         // "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],

tensorrt_llm/evaluate/interface.py

Lines changed: 6 additions & 2 deletions
@@ -34,13 +34,15 @@ def __init__(self,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
         self.apply_chat_template = apply_chat_template
         self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
+        self.chat_template_kwargs = chat_template_kwargs

     @abstractmethod
     def generate_samples(self) -> Iterable[tuple]:
@@ -64,7 +66,9 @@ def do_apply_chat_template(self, llm: Any,
         }] + messages
         return llm.tokenizer.apply_chat_template(messages,
                                                  tokenize=False,
-                                                 add_generation_prompt=True)
+                                                 add_generation_prompt=True,
+                                                 **(self.chat_template_kwargs
+                                                    or {}))

     def _get_sampline_params(self, sampling_params: Optional[SamplingParams],
                              sampling_args: Optional[dict]) -> SamplingParams:
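The pattern being introduced: an optional dict rides along on the evaluator and is splatted into the tokenizer's chat-template call. A minimal standalone sketch of that call shape, assuming an HF-style tokenizer with apply_chat_template (the render_prompt helper is hypothetical, not part of this diff):

    from typing import Any, Optional

    def render_prompt(tokenizer: Any,
                      messages: list[dict],
                      chat_template_kwargs: Optional[dict[str, Any]] = None) -> str:
        # The "or {}" guard means the keyword splat is valid even when the
        # caller passed None (the default everywhere in this PR).
        return tokenizer.apply_chat_template(messages,
                                             tokenize=False,
                                             add_generation_prompt=True,
                                             **(chat_template_kwargs or {}))

Extra keyword arguments to an HF apply_chat_template become variables in the Jinja template, so a template that understands thinking_budget (as Seed-OSS's appears to, per the test below) can be steered without touching evaluator code.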

tensorrt_llm/evaluate/lm_eval.py

Lines changed: 57 additions & 8 deletions
@@ -16,7 +16,7 @@
 import json
 import os
 from contextlib import contextmanager
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import click
 import numpy as np
@@ -51,11 +51,13 @@ class LmEvalWrapper(TemplateLM):
     def __init__(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
-                 streaming: bool = False):
+                 streaming: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__()
         self.llm = llm
         self.sampling_params = sampling_params
         self.streaming = streaming
+        self.chat_template_kwargs = chat_template_kwargs

     @property
     def eot_token_id(self) -> int:
@@ -72,6 +74,7 @@ def apply_chat_template(self,
             tokenize=False,
             add_generation_prompt=add_generation_prompt,
             continue_final_message=not add_generation_prompt,
+            **(self.chat_template_kwargs or {}),
         )

     @property
@@ -146,7 +149,8 @@ def __init__(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
                  streaming: bool = False,
-                 max_images: int = 999):
+                 max_images: int = 999,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         """
         Initialize the multimodal wrapper.

@@ -161,6 +165,7 @@ def __init__(self,
         # NOTE: Required by lm_eval to identify this as a multimodal model
         self.MULTIMODAL = True
         self.max_images = max_images
+        self.chat_template_kwargs = chat_template_kwargs
         self.model_type = self._get_model_type(llm)

         # NOTE: In TRT-LLM, currently we do not support interleaved text and image. Instead, we are adding image placeholders at the end of the text or at the beginning of the text.
@@ -237,7 +242,9 @@ def apply_chat_template(self,
             mm_placeholder_counts=mm_placeholder_counts,
             tools=None,
             chat_template_kwargs={
-                "continue_final_message": not add_generation_prompt
+                **(self.chat_template_kwargs or {}),
+                "continue_final_message":
+                not add_generation_prompt,
             })
         return output

@@ -301,7 +308,8 @@ def __init__(self,
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
-                 is_multimodal: bool = False):
+                 is_multimodal: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -319,7 +327,8 @@ def __init__(self,
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
                          fewshot_as_multiturn=fewshot_as_multiturn,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
@@ -390,7 +399,10 @@ def evaluate(self,
         import lm_eval
         lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
         results = lm_eval.evaluate(
-            lm=lm_cls(llm, sampling_params, streaming),
+            lm=lm_cls(llm,
+                      sampling_params=sampling_params,
+                      streaming=streaming,
+                      chat_template_kwargs=self.chat_template_kwargs),
             task_dict=self.task_dict,
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
@@ -428,7 +440,9 @@ def command_harness(cls, ctx, **kwargs):
             fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
                                             False),
             system_prompt=kwargs.pop("system_prompt", None),
-            is_multimodal=kwargs.pop("is_multimodal", False))
+            is_multimodal=kwargs.pop("is_multimodal", False),
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs",
+                                            None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -462,6 +476,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--fewshot_as_multiturn",
               is_flag=True,
               default=False,
@@ -513,6 +534,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -556,6 +584,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -599,6 +634,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -638,6 +680,13 @@ def __init__(self, **kwargs):
               type=int,
               default=0,
               help="Random seed for dataset processing.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option(
     "--system_prompt",
     type=str,
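Since the same --chat_template_kwargs option is stamped onto every evaluator command, the callback's behavior is worth pinning down once. A standalone sketch with equivalent logic (not copied verbatim from the module):

    import json
    from typing import Optional

    def parse_chat_template_kwargs(value: Optional[str]) -> Optional[dict]:
        # Mirrors the click callback above: an absent or empty flag yields
        # None; anything else must be valid JSON and decodes to a dict.
        return json.loads(value) if value else None

    assert parse_chat_template_kwargs(None) is None
    assert parse_chat_template_kwargs('{"thinking_budget": 0}') == {
        "thinking_budget": 0
    }

One consequence of the bare json.loads: malformed JSON surfaces as a json.JSONDecodeError traceback rather than a click usage error, which is a reasonable trade for a test-facing flag.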

tensorrt_llm/evaluate/mmlu.py

Lines changed: 16 additions & 4 deletions
@@ -21,6 +21,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+import json
 # Not a contribution
 # Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as
 # NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
@@ -34,7 +35,7 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 import math
-from typing import Iterable, List, Optional, Union
+from typing import Any, Iterable, List, Optional, Union

 import click
 import numpy as np
@@ -137,10 +138,12 @@ def __init__(self,
                  num_fewshot: int = 5,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         if dataset_path is None:
             dataset_path = self.dowload_dataset()
         self.dataset_path = dataset_path
@@ -296,6 +299,13 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -314,6 +324,7 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
                 num_fewshot: int, random_seed: int, apply_chat_template: bool,
+                chat_template_kwargs: Optional[dict[str, Any]],
                 system_prompt: Optional[str], max_input_length: int,
                 max_output_length: int, check_accuracy: bool,
                 accuracy_threshold: float) -> None:
@@ -326,7 +337,8 @@ def command(ctx, dataset_path: Optional[str], num_samples: int,
             num_fewshot=num_fewshot,
             random_seed=random_seed,
             apply_chat_template=apply_chat_template,
-            system_prompt=system_prompt)
+            system_prompt=system_prompt,
+            chat_template_kwargs=chat_template_kwargs)
         accuracy = evaluator.evaluate(llm, sampling_params)
         llm.shutdown()
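For completeness, a hypothetical programmatic equivalent of the new MMLU CLI path (constructor arguments are those in the signature above; the llm and sampling_params objects are assumed to exist already):

    # Hypothetical usage of the MMLU evaluator from this diff.
    evaluator = MMLU(num_fewshot=5,
                     random_seed=0,
                     apply_chat_template=True,
                     chat_template_kwargs={"thinking_budget": 0})
    accuracy = evaluator.evaluate(llm, sampling_params)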

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
@@ -221,3 +221,5 @@ GPT-OSS/MXFP4:
     accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36
+ByteDance-Seed/Seed-OSS-36B-Instruct:
+  - accuracy: 90.8

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 23 additions & 0 deletions
@@ -3656,3 +3656,26 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestSeedOss_36B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
+
+    gsm8k_sampling_params = SamplingParams(temperature=1.1,
+                                           top_p=0.95,
+                                           max_tokens=16384)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device_memory(140000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        chat_template_kwargs = dict(thinking_budget=-1)
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=self.gsm8k_sampling_params,
+                          extra_evaluator_kwargs=dict(
+                              apply_chat_template=True,
+                              chat_template_kwargs=chat_template_kwargs))
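Two reading notes on the test: thinking_budget=-1 appears to ask the Seed-OSS chat template for an unbounded reasoning budget (hence the generous max_tokens=16384, which must cover the reasoning trace as well as the answer), and the score the run is checked against is the 90.8 GSM8K reference added to gsm8k.yaml above.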

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
@@ -609,6 +609,7 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype

 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions
@@ -146,3 +146,4 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
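Taken together with the Jenkins change at the top, this l0_b200 entry is what actually schedules the new Seed-OSS accuracy test on B200 post-merge runs, with the PyTorch stage split into two shards, presumably to absorb the extra runtime.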
