
Commit b4d17d1

[TRTLLM-8991][test] Add Llama 3.3 70B model with different performance config (#8753)
Signed-off-by: yufeiwu-nv <[email protected]>
Co-authored-by: Larry Xu <[email protected]>
1 parent f57dc01 commit b4d17d1

File tree

4 files changed (+58, -16 lines)

tensorrt_llm/bench/benchmark/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
     """
     llm_cls = LLM
 
-    if runtime_config.backend != "tensorrt":
+    if runtime_config.backend != None:
         ignore_trt_only_args(kwargs, runtime_config.backend)
 
     if runtime_config.backend == 'pytorch':
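A quick way to see what this one-line guard change does: the snippet below is not part of the commit; it only evaluates the old and new conditions for a few representative backend values. Whichever condition is true is the case where ignore_trt_only_args() runs and TRT-only options are stripped from the kwargs.

# Illustrative only: compares the old and new guards from the hunk above.
for backend in (None, "", "pytorch", "tensorrt", "_autodeploy"):
    old_strips = backend != "tensorrt"   # old guard
    new_strips = backend != None         # new guard, written as in the diff  # noqa: E711
    print(f"backend={backend!r:14} old_strips={old_strips!s:6} new_strips={new_strips}")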

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 28 additions & 8 deletions
@@ -14,7 +14,7 @@
 # limitations under the License.
 # -*- coding: utf-8 -*-
 """
-Model pytorch yaml config for trtllm-bench perf tests
+Model pytorch/TRT yaml config for trtllm-bench perf tests
 """
 
 
@@ -36,12 +36,18 @@ def get_model_yaml_config(model_label: str,
     Returns:
         dict: yaml config
     """
-    base_config = {
-        'print_iter_log': True,
-        'cuda_graph_config': {
-            'enable_padding': True,
-        },
-    }
+    if 'pytorch' in model_label:
+        # Pytorch backend config
+        base_config = {
+            'print_iter_log': True,
+            'cuda_graph_config': {
+                'enable_padding': True,
+            },
+        }
+    else:
+        # TRT backend config
+        base_config = {}
+
     if 'kv_cache_dtype' in model_label:
         base_config.update({
             'kv_cache_dtype':
@@ -241,6 +247,19 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'enable_chunked_prefill': True,
             }
+        },
+        # Llama-v3.3 models with xgrammar guided decoding
+        {
+            'patterns': [
+                "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra"
+            ],
+            'config': {
+                'extended_runtime_perf_knob_config': {
+                    'cuda_graph_cache_size': 1.0,
+                    'cuda_graph_mode': True,
+                },
+                'guided_decoding_backend': 'xgrammar'
+            }
         }
     ]
 
@@ -251,7 +270,8 @@ def get_model_yaml_config(model_label: str,
             patterns = [patterns]
         for pattern in patterns:
             if pattern in model_label.lower():
-                recursive_update(base_config, pattern_config['config'])
+                if pattern_config.get('config'):
+                    recursive_update(base_config, pattern_config['config'])
                 break  # Stop checking other patterns for this config once we find a match
 
     # lora-specific change for pytorch
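For the new TRT-backend label ending in "-extra", get_model_yaml_config() now starts from an empty base_config and merges in the matching pattern's 'config' block via recursive_update(). Assuming no other pattern in pytorch_model_config.py also matches that label, the returned dict (and hence the YAML later dumped to extra-llm-api-config.yml) would look roughly like this sketch:

# Sketch of the expected return value for the "...-gpus:8-extra" label
# (assumption: only the new Llama-v3.3 xgrammar pattern matches it).
expected_config = {
    'extended_runtime_perf_knob_config': {
        'cuda_graph_cache_size': 1.0,
        'cuda_graph_mode': True,
    },
    'guided_decoding_backend': 'xgrammar',
}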

tests/integration/defs/perf/test_perf.py

Lines changed: 27 additions & 7 deletions
@@ -57,7 +57,6 @@
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
-    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp8":
         "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
@@ -783,6 +782,8 @@ def __init__(
         tp_size: int = 1,
         pp_size: int = 1,
         num_gpus: int = 1,
+        # only for torch-backend currently
+        extra: bool = False,
         # _autodeploy backend specific parameters
         ad_compile_backend: str = "torch-opt",
         free_mem_ratio: float = 0.9,
@@ -841,6 +842,8 @@ def __init__(
         self.pp_size = pp_size
         # Number of GPUs.
         self.num_gpus = num_gpus
+        # Extra flag to enable pytorch_model_config reading for TRT backend
+        self.extra = extra
         # _autodeploy backend specific parameters
         self.ad_compile_backend = ad_compile_backend
         self.free_mem_ratio = free_mem_ratio
@@ -1016,6 +1019,10 @@ def to_string(self,
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")
 
+        # Add extra flag for llm-api-config.yml.
+        if self.extra:
+            entries.append("extra")
+
         # Concatenate labels with "-".
         return "-".join(entries)
 
@@ -1180,6 +1187,11 @@ def load_from_str(self, test_param_labels) -> None:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))
 
+        if len(labels) > 0:
+            self.extra = True if labels[0] == "extra" else False
+            if self.extra:
+                labels.pop(0)
+
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
@@ -1644,18 +1656,26 @@ def get_trtllm_bench_command(self, engine_dir):
         benchmark_cmd += [f"--pp={self._config.pp_size}"]
         if self._config.streaming == "streaming":
             benchmark_cmd += [f"--streaming"]
-        #use default yaml config
-        if self._config.backend == "pytorch":
+
+        #Add extra-llm-api-config.yml for pytorch backend and tensorrt backend with extra flag
+        if self._config.backend == "pytorch" or (self._config.backend == ""
+                                                 and self._config.extra):
             pytorch_config_path = os.path.join(engine_dir,
                                                "extra-llm-api-config.yml")
             if not os.path.exists(pytorch_config_path):
                 os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
             config = get_model_yaml_config(self._config.to_string(),
                                            lora_dirs=self.lora_dirs)
-            print_info(f"pytorch model config: {config}")
-            with open(pytorch_config_path, 'w') as f:
-                yaml.dump(config, f, default_flow_style=False)
-            benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
+            if config:
+                print_info(f"pytorch/TRT model config: {config}")
+                with open(pytorch_config_path, 'w') as f:
+                    yaml.dump(config, f, default_flow_style=False)
+                benchmark_cmd += [
+                    f"--extra_llm_api_options={pytorch_config_path}"
+                ]
+            # If guided_decoding_backend is set, we need to initialize tokenizer
+            if config.get('guided_decoding_backend') is not None:
+                benchmark_cmd += ["--no_skip_tokenizer_init"]
         elif self._config.backend == "_autodeploy":
             autodeploy_config_path = os.path.join(engine_dir,
                                                   "extra_llm_api_options.yaml")

tests/integration/test_lists/qa/llm_perf_nim.yml

Lines changed: 2 additions & 0 deletions
@@ -392,6 +392,8 @@ llm_perf_nim:
 #trt backend
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8]
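The two new entries differ only in the trailing -extra label: based on the hunks above, the first runs the TRT backend without writing any extra-llm-api-config.yml (the config block in get_trtllm_bench_command() is skipped when extra is not set), while the second exercises the xgrammar guided-decoding and extended runtime perf-knob configuration. This reading is inferred from the diffs rather than stated in the commit message.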
