Skip to content

Commit e6073b3

Browse files
ruodil and LarryXFly authored
[None][test] add gpt oss model for trtllm perf test (#7328)
Signed-off-by: Ruodi Lu <[email protected]> Signed-off-by: Ruodi Lu <[email protected]> Co-authored-by: Ruodi Lu <[email protected]> Co-authored-by: Larry <[email protected]>
1 parent 7801d09 commit e6073b3

File tree

4 files changed

+54
-2
lines changed

4 files changed

+54
-2
lines changed

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,46 @@ def get_model_yaml_config(model_label: str,
166166
]
167167
}
168168
}
169+
},
170+
# GPT-OSS 120B max throughput test
171+
{
172+
'patterns': [
173+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256',
174+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512',
175+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024',
176+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096'
177+
],
178+
'config': {
179+
'enable_attention_dp': True,
180+
'cuda_graph_config': {
181+
'enable_padding': True,
182+
'max_batch_size': 720,
183+
},
184+
'moe_config': {
185+
'backend': 'CUTLASS'
186+
},
187+
'stream_interval': 10,
188+
'num_postprocess_workers': 4
189+
}
190+
},
191+
# GPT-OSS 120B min latency test
192+
{
193+
'patterns': [
194+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1',
195+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32'
196+
],
197+
'config': {
198+
'enable_attention_dp': False,
199+
'cuda_graph_config': {
200+
'enable_padding': True,
201+
'max_batch_size': 720,
202+
},
203+
'moe_config': {
204+
'backend': 'TRTLLM'
205+
},
206+
'stream_interval': 10,
207+
'num_postprocess_workers': 4
208+
}
169209
}
170210
]
171211

tests/integration/defs/perf/test_perf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
"bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct",
131131
"bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8",
132132
"mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
133+
"gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
133134
}
134135
# Model PATH of HuggingFace
135136
HF_MODEL_PATH = {

tests/integration/test_lists/qa/llm_perf_cluster.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,12 @@ llm_perf_cluster:
144144
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40)
145145
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8]
146146
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8]
147+
#gpt_oss_120b
148+
# max throughput test
149+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8]
150+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8]
151+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120)
152+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180)
153+
# min latency test
154+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8]
155+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8]

tests/integration/test_lists/qa/llm_perf_sanity.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,8 @@ llm_perf_sanity:
172172
#pytorch backend
173173
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
174174
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
175-
- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
176-
- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
175+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
176+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
177177

178178
# FP8 tests for systems with 8+ GPUs
179179
- condition:
@@ -215,6 +215,8 @@ llm_perf_sanity:
215215
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
216216
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:128,128]
217217
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
218+
# for chunked prefill cases
219+
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200]
218220
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
219221
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8]
220222
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]

0 commit comments

Comments (0)