
Commit 7a2bab9

[None][test] Add post merge test for Seed-OSS-36B-Instruct (NVIDIA#8321)
Signed-off-by: Zhen Huang <[email protected]>
1 parent e72ade3 commit 7a2bab9

File tree: 9 files changed, +109 −15 lines


jenkins/L0_Test.groovy

Lines changed: 2 additions & 1 deletion
@@ -2503,7 +2503,8 @@ def launchTestJobs(pipeline, testFilter)
         // "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
         "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
+        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         // "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
         // "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],

tensorrt_llm/evaluate/interface.py

Lines changed: 6 additions & 2 deletions
@@ -34,13 +34,15 @@ def __init__(self,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
         self.apply_chat_template = apply_chat_template
         self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
+        self.chat_template_kwargs = chat_template_kwargs

     @abstractmethod
     def generate_samples(self) -> Iterable[tuple]:
@@ -64,7 +66,9 @@ def do_apply_chat_template(self, llm: Any,
         }] + messages
         return llm.tokenizer.apply_chat_template(messages,
                                                  tokenize=False,
-                                                 add_generation_prompt=True)
+                                                 add_generation_prompt=True,
+                                                 **(self.chat_template_kwargs
+                                                    or {}))

     def _get_sampline_params(self, sampling_params: Optional[SamplingParams],
                              sampling_args: Optional[dict]) -> SamplingParams:
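The pattern being introduced: an optional dict rides along on the evaluator and is splatted into the tokenizer's chat-template call. A minimal standalone sketch of that call shape, assuming an HF-style tokenizer with apply_chat_template (the render_prompt helper is hypothetical, not part of this diff):

    from typing import Any, Optional

    def render_prompt(tokenizer: Any,
                      messages: list[dict],
                      chat_template_kwargs: Optional[dict[str, Any]] = None) -> str:
        # The "or {}" guard means the keyword splat is valid even when the
        # caller passed None (the default everywhere in this PR).
        return tokenizer.apply_chat_template(messages,
                                             tokenize=False,
                                             add_generation_prompt=True,
                                             **(chat_template_kwargs or {}))

Extra keyword arguments to an HF apply_chat_template become variables in the Jinja template, so a template that understands thinking_budget (as Seed-OSS's appears to, per the test below) can be steered without touching evaluator code.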

tensorrt_llm/evaluate/lm_eval.py

Lines changed: 57 additions & 8 deletions
@@ -16,7 +16,7 @@
 import json
 import os
 from contextlib import contextmanager
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import click
 import numpy as np
@@ -51,11 +51,13 @@ class LmEvalWrapper(TemplateLM):
     def __init__(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
-                 streaming: bool = False):
+                 streaming: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__()
         self.llm = llm
         self.sampling_params = sampling_params
         self.streaming = streaming
+        self.chat_template_kwargs = chat_template_kwargs

     @property
     def eot_token_id(self) -> int:
@@ -72,6 +74,7 @@ def apply_chat_template(self,
             tokenize=False,
             add_generation_prompt=add_generation_prompt,
             continue_final_message=not add_generation_prompt,
+            **(self.chat_template_kwargs or {}),
         )

     @property
@@ -146,7 +149,8 @@ def __init__(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
                  streaming: bool = False,
-                 max_images: int = 999):
+                 max_images: int = 999,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         """
         Initialize the multimodal wrapper.

@@ -161,6 +165,7 @@ def __init__(self,
         # NOTE: Required by lm_eval to identify this as a multimodal model
         self.MULTIMODAL = True
         self.max_images = max_images
+        self.chat_template_kwargs = chat_template_kwargs
         self.model_type = self._get_model_type(llm)

         # NOTE: In TRT-LLM, currently we do not support interleaved text and image. Instead, we are adding image placeholders at the end of the text or at the beginning of the text.
@@ -237,7 +242,9 @@ def apply_chat_template(self,
             mm_placeholder_counts=mm_placeholder_counts,
             tools=None,
             chat_template_kwargs={
-                "continue_final_message": not add_generation_prompt
+                **(self.chat_template_kwargs or {}),
+                "continue_final_message":
+                not add_generation_prompt,
             })
         return output

@@ -301,7 +308,8 @@ def __init__(self,
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
-                 is_multimodal: bool = False):
+                 is_multimodal: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -319,7 +327,8 @@ def __init__(self,
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
                          fewshot_as_multiturn=fewshot_as_multiturn,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
@@ -390,7 +399,10 @@ def evaluate(self,
         import lm_eval
         lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
         results = lm_eval.evaluate(
-            lm=lm_cls(llm, sampling_params, streaming),
+            lm=lm_cls(llm,
+                      sampling_params=sampling_params,
+                      streaming=streaming,
+                      chat_template_kwargs=self.chat_template_kwargs),
             task_dict=self.task_dict,
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
@@ -428,7 +440,9 @@ def command_harness(cls, ctx, **kwargs):
             fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
                                             False),
             system_prompt=kwargs.pop("system_prompt", None),
-            is_multimodal=kwargs.pop("is_multimodal", False))
+            is_multimodal=kwargs.pop("is_multimodal", False),
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs",
+                                            None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -462,6 +476,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--fewshot_as_multiturn",
               is_flag=True,
               default=False,
@@ -513,6 +534,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -556,6 +584,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -599,6 +634,13 @@ def __init__(self, **kwargs):
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -638,6 +680,13 @@ def __init__(self, **kwargs):
               type=int,
               default=0,
               help="Random seed for dataset processing.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option(
     "--system_prompt",
     type=str,
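Since the same --chat_template_kwargs option is stamped onto every evaluator command, the callback's behavior is worth pinning down once. A standalone sketch with equivalent logic (not copied verbatim from the module):

    import json
    from typing import Optional

    def parse_chat_template_kwargs(value: Optional[str]) -> Optional[dict]:
        # Mirrors the click callback above: an absent or empty flag yields
        # None; anything else must be valid JSON and decodes to a dict.
        return json.loads(value) if value else None

    assert parse_chat_template_kwargs(None) is None
    assert parse_chat_template_kwargs('{"thinking_budget": 0}') == {
        "thinking_budget": 0
    }

One consequence of the bare json.loads: malformed JSON surfaces as a json.JSONDecodeError traceback rather than a click usage error, which is a reasonable trade for a test-facing flag.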

tensorrt_llm/evaluate/mmlu.py

Lines changed: 16 additions & 4 deletions
@@ -21,6 +21,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+import json
 # Not a contribution
 # Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as
 # NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
@@ -34,7 +35,7 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 import math
-from typing import Iterable, List, Optional, Union
+from typing import Any, Iterable, List, Optional, Union

 import click
 import numpy as np
@@ -137,10 +138,12 @@ def __init__(self,
                  num_fewshot: int = 5,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         if dataset_path is None:
             dataset_path = self.dowload_dataset()
         self.dataset_path = dataset_path
@@ -296,6 +299,13 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
               is_flag=True,
               default=False,
               help="Whether to apply chat template.")
+@click.option(
+    "--chat_template_kwargs",
+    type=str,
+    default=None,
+    callback=lambda ctx, param, value: json.loads(value) if value else None,
+    help=
+    'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
 @click.option("--system_prompt",
               type=str,
               default=None,
@@ -314,6 +324,7 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
                 num_fewshot: int, random_seed: int, apply_chat_template: bool,
+                chat_template_kwargs: Optional[dict[str, Any]],
                 system_prompt: Optional[str], max_input_length: int,
                 max_output_length: int, check_accuracy: bool,
                 accuracy_threshold: float) -> None:
@@ -326,7 +337,8 @@ def command(ctx, dataset_path: Optional[str], num_samples: int,
             num_fewshot=num_fewshot,
             random_seed=random_seed,
             apply_chat_template=apply_chat_template,
-            system_prompt=system_prompt)
+            system_prompt=system_prompt,
+            chat_template_kwargs=chat_template_kwargs)
         accuracy = evaluator.evaluate(llm, sampling_params)
         llm.shutdown()
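For completeness, a hypothetical programmatic equivalent of the new MMLU CLI path (constructor arguments are those in the signature above; the llm and sampling_params objects are assumed to exist already):

    # Hypothetical usage of the MMLU evaluator from this diff.
    evaluator = MMLU(num_fewshot=5,
                     random_seed=0,
                     apply_chat_template=True,
                     chat_template_kwargs={"thinking_budget": 0})
    accuracy = evaluator.evaluate(llm, sampling_params)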

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
@@ -221,3 +221,5 @@ GPT-OSS/MXFP4:
     accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36
+ByteDance-Seed/Seed-OSS-36B-Instruct:
+  - accuracy: 90.8

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 23 additions & 0 deletions
@@ -3656,3 +3656,26 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestSeedOss_36B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
+
+    gsm8k_sampling_params = SamplingParams(temperature=1.1,
+                                           top_p=0.95,
+                                           max_tokens=16384)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device_memory(140000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        chat_template_kwargs = dict(thinking_budget=-1)
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=self.gsm8k_sampling_params,
+                          extra_evaluator_kwargs=dict(
+                              apply_chat_template=True,
+                              chat_template_kwargs=chat_template_kwargs))
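Two reading notes on the test: thinking_budget=-1 appears to ask the Seed-OSS chat template for an unbounded reasoning budget (hence the generous max_tokens=16384, which must cover the reasoning trace as well as the answer), and the score the run is checked against is the 90.8 GSM8K reference added to gsm8k.yaml above.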

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
@@ -609,6 +609,7 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype

 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions
@@ -146,3 +146,4 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
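Taken together with the Jenkins change at the top, this l0_b200 entry is what actually schedules the new Seed-OSS accuracy test on B200 post-merge runs, with the PyTorch stage split into two shards, presumably to absorb the extra runtime.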
