
Commit 1fbea49

[TRTLLM-7070][feat] add gpt-oss serve benchmark tests (#7638)
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent 750d15b commit 1fbea49

File tree

7 files changed: +37 / -21 lines

tests/integration/defs/test_e2e.py

Lines changed: 9 additions & 4 deletions

@@ -1735,11 +1735,16 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
 
 
 @pytest.mark.skip_less_device_memory(80000)
-def test_trtllm_benchmark_serving(llm_root, llm_venv):
+@pytest.mark.parametrize(
+    "model_name", ["llama-3.1-model/Meta-Llama-3.1-8B", "gpt_oss/gpt-oss-20b"])
+def test_trtllm_benchmark_serving(llm_venv, model_name):
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd(
-        ["-m", "pytest",
-         str(test_root / "_test_trtllm_serve_benchmark.py")])
+    llm_venv.run_cmd([
+        "-m", "pytest",
+        str(test_root /
+            f"_test_trtllm_serve_benchmark.py::test_trtllm_serve_benchmark[{model_name}]"
+            )
+    ])
 
 
 def test_build_time_benchmark_sanity(llm_root, llm_venv):
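For context, a minimal sketch (not part of this commit) of what the new f-string resolves to: instead of running the whole helper file, the wrapper now selects a single parametrized case of the inner test by its pytest node ID. The tests/unittest/llmapi/apps path below is assumed for illustration; in the real test it comes from unittest_path().

from pathlib import Path

# Assumed location of the llmapi app tests; the real test derives it via unittest_path().
test_root = Path("tests/unittest/llmapi/apps")
for model_name in ["llama-3.1-model/Meta-Llama-3.1-8B", "gpt_oss/gpt-oss-20b"]:
    node_id = str(
        test_root /
        f"_test_trtllm_serve_benchmark.py::test_trtllm_serve_benchmark[{model_name}]")
    # e.g. tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py::test_trtllm_serve_benchmark[gpt_oss/gpt-oss-20b]
    print(node_id)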

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 2 additions & 1 deletion

@@ -610,7 +610,8 @@ test_e2e.py::test_mistral_e2e[use_py_session---]
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
-test_e2e.py::test_trtllm_benchmark_serving
+test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
+test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]
 test_e2e.py::test_trtllm_multimodal_benchmark_serving
 
 llmapi/test_llm_examples.py::test_llmapi_server_example

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 1 addition & 1 deletion

@@ -235,5 +235,5 @@ test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Me
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False]
-test_e2e.py::test_trtllm_benchmark_serving
+test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]
 test_e2e.py::test_trtllm_multimodal_benchmark_serving

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 1 addition & 0 deletions

@@ -172,3 +172,4 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[DeepSeek-V3-671B-FP8-DeepSeek-V3-0324]
+test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 0 deletions

@@ -109,6 +109,7 @@ l0_h100:
 - test_e2e.py::test_openai_chat_harmony
 - test_e2e.py::test_openai_responses
 - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] TIMEOUT (90)
+- test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
 # ------------- AutoDeploy tests ---------------
 - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
 - condition:

tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py

Lines changed: 17 additions & 9 deletions

@@ -12,8 +12,8 @@
 
 
 @pytest.fixture(scope="module")
-def model_name():
-    return "llama-3.1-model/Meta-Llama-3.1-8B"
+def model_name(request):
+    return request.param
 
 
 @pytest.fixture(scope="module")
@@ -23,8 +23,10 @@ def model_path(model_name: str):
 
 @pytest.fixture(scope="module")
 def server(model_path: str):
+    args = ["--kv_cache_free_gpu_memory_fraction", "0.8"]
     # fix port to facilitate concise trtllm-serve examples
-    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+    with RemoteOpenAIServer(model_path, cli_args=args,
+                            port=8000) as remote_server:
         yield remote_server
 
 
@@ -43,18 +45,24 @@ def dataset_path(dataset_name: str):
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.parametrize(
+    "model_name", ["llama-3.1-model/Meta-Llama-3.1-8B", "gpt_oss/gpt-oss-20b"],
+    indirect=True)
 def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
                                 model_path: str):
+    model_name = model_path.split("/")[-1]
     client_script = os.path.join(benchmark_root, "benchmark_serving.py")
     dataset = dataset_path("sharegpt")
     benchmark_cmd = [
         "python3", client_script, "--dataset-name", "sharegpt", "--model",
-        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+        model_name, "--dataset-path", dataset, "--tokenizer", model_path
    ]
 
     # CalledProcessError will be raised if any errors occur
-    subprocess.run(benchmark_cmd,
-                   stdout=subprocess.PIPE,
-                   stderr=subprocess.PIPE,
-                   text=True,
-                   check=True)
+    result = subprocess.run(benchmark_cmd,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE,
+                            text=True,
+                            check=True)
+    assert result.returncode == 0
+    assert "Serving Benchmark Result" in result.stdout

tests/unittest/utils/util.py

Lines changed: 6 additions & 6 deletions

@@ -185,14 +185,14 @@ def skip_gpu_memory_less_than(required_memory: int):
 )
 
 
-skip_gpu_memory_less_than_40gb = skip_gpu_memory_less_than(40 * 1024 * 1024 *
-                                                           1024)
+skip_gpu_memory_less_than_40gb = skip_gpu_memory_less_than(40 * 1000 * 1000 *
+                                                           1000)
 
-skip_gpu_memory_less_than_80gb = skip_gpu_memory_less_than(80 * 1024 * 1024 *
-                                                           1024)
+skip_gpu_memory_less_than_80gb = skip_gpu_memory_less_than(80 * 1000 * 1000 *
+                                                           1000)
 
-skip_gpu_memory_less_than_138gb = skip_gpu_memory_less_than(138 * 1024 * 1024 *
-                                                            1024)
+skip_gpu_memory_less_than_138gb = skip_gpu_memory_less_than(138 * 1000 * 1000 *
+                                                            1000)
 
 
 def modelopt_installed():
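For reference, a small illustrative calculation (not from the commit) of what switching the skip thresholds from binary multipliers (1024-based, GiB) to decimal ones (1000-based, GB) changes for the 80 GB marker; presumably this keeps 80 GB-class GPUs, which report slightly under 80 GiB of total memory, from being skipped.

# Old threshold: 80 GiB; new threshold: 80 GB (decimal).
old_threshold = 80 * 1024 * 1024 * 1024   # 85_899_345_920 bytes
new_threshold = 80 * 1000 * 1000 * 1000   # 80_000_000_000 bytes
print(old_threshold - new_threshold)      # 5_899_345_920 bytes of slack gained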
