
Commit 1c3091c

xinhe-nv and LarryXFly authored
tests: [TRTQA-2906] add benchmark serving tests (NVIDIA#4901)
Signed-off-by: xinhe-nv <[email protected]> Co-authored-by: Larry <[email protected]>
1 parent ddbaa5e commit 1c3091c

File tree

3 files changed: 75 additions, 0 deletions

tests/integration/defs/test_e2e.py

Lines changed: 14 additions & 0 deletions
@@ -1433,6 +1433,20 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
     ])


+@pytest.mark.skip_less_device_memory(80000)
+def test_trtllm_benchmark_serving(llm_root, llm_venv):
+    example_root = Path(os.path.join(llm_root, "examples", "apps"))
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pip", "install", "-r",
+        os.path.join(example_root, "requirements.txt")
+    ])
+
+    llm_venv.run_cmd(
+        ["-m", "pytest",
+         str(test_root / "_test_trtllm_serve_benchmark.py")])
+
+
 def test_build_time_benchmark_sanity(llm_root, llm_venv):
     temp = tempfile.TemporaryDirectory()
     llm_venv.run_cmd([
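For context, the new wrapper test performs two steps: it installs the example-app requirements into the test virtualenv, then runs the standalone serving-benchmark test under pytest. A minimal standalone sketch of those same two steps, assuming LLM_ROOT points at a TensorRT-LLM checkout and using the current interpreter in place of the llm_venv fixture (the paths below are illustrative, not taken from this commit):

import os
import subprocess
import sys

# Assumed layout: LLM_ROOT is the repository root; adjust paths as needed.
llm_root = os.environ.get("LLM_ROOT", ".")
example_root = os.path.join(llm_root, "examples", "apps")
benchmark_test = os.path.join(llm_root, "tests", "unittest", "llmapi", "apps",
                              "_test_trtllm_serve_benchmark.py")

# Step 1: install the example-app requirements (mirrors the first run_cmd call).
subprocess.run([sys.executable, "-m", "pip", "install", "-r",
                os.path.join(example_root, "requirements.txt")], check=True)

# Step 2: run the standalone benchmark test (mirrors the second run_cmd call).
subprocess.run([sys.executable, "-m", "pytest", benchmark_test], check=True)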

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 1 addition & 0 deletions
@@ -490,6 +490,7 @@ test_e2e.py::test_mistral_e2e[use_py_session---]
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
+test_e2e.py::test_trtllm_benchmark_serving
 llmapi/test_llm_examples.py::test_llmapi_server_example
 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart
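The added entry registers the new wrapper test with the QA example list; entries in this file use pytest node-id syntax. A minimal sketch of selecting that node id directly, assuming the integration tests are run from tests/integration/defs (the working directory is an assumption, not stated in the diff):

import pytest

# Hypothetical direct invocation; in CI the QA list is consumed by the test
# harness rather than passed to pytest.main() like this.
exit_code = pytest.main(["test_e2e.py::test_trtllm_benchmark_serving"])
raise SystemExit(exit_code)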
tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py (new file)

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+import os
+import subprocess
+import sys
+
+import pytest
+from utils.util import skip_gpu_memory_less_than_80gb
+
+from .openai_server import RemoteOpenAIServer
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from test_llm import get_model_path
+
+
+@pytest.fixture(scope="module")
+def model_name():
+    return "llama-3.1-model/Meta-Llama-3.1-8B"
+
+
+@pytest.fixture(scope="module")
+def model_path(model_name: str):
+    return get_model_path(model_name)
+
+
+@pytest.fixture(scope="module")
+def server(model_path: str):
+    # fix port to facilitate concise trtllm-serve examples
+    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+def dataset_path(dataset_name: str):
+    if dataset_name == "sharegpt":
+        return get_model_path(
+            "datasets/ShareGPT_V3_unfiltered_cleaned_split.json")
+    else:
+        raise ValueError(f"Invalid dataset name: {dataset_name}")
+
+
+@skip_gpu_memory_less_than_80gb
+def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
+                                model_path: str):
+    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
+    dataset = dataset_path("sharegpt")
+    benchmark_cmd = [
+        "python3", client_script, "--dataset-name", "sharegpt", "--model",
+        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+    ]
+
+    # CalledProcessError will be raised if any errors occur
+    subprocess.run(benchmark_cmd,
+                   stdout=subprocess.PIPE,
+                   stderr=subprocess.PIPE,
+                   text=True,
+                   check=True)
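For reference, the subprocess call in the test is equivalent to running the serving-benchmark client against the OpenAI-compatible server that the module fixture starts on port 8000. A minimal standalone sketch, assuming a trtllm-serve endpoint is already listening and that the dataset and tokenizer paths below are local placeholders you substitute yourself:

import os
import subprocess

# Assumption: LLM_ROOT points at the repository root containing the script.
llm_root = os.environ["LLM_ROOT"]
client_script = os.path.join(llm_root, "tensorrt_llm", "serve", "scripts",
                             "benchmark_serving.py")

# Placeholder inputs: a local ShareGPT JSON dump and a Llama 3.1 tokenizer dir.
dataset = "/path/to/ShareGPT_V3_unfiltered_cleaned_split.json"
tokenizer = "/path/to/Meta-Llama-3.1-8B"

# Same flags as benchmark_cmd above; check=True raises CalledProcessError on failure.
subprocess.run([
    "python3", client_script,
    "--dataset-name", "sharegpt",
    "--model", "llama",
    "--dataset-path", dataset,
    "--tokenizer", tokenizer,
], check=True)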
