

@pytest.fixture(scope="module")
-def model_name():
-    return "llama-3.1-model/Meta-Llama-3.1-8B"
+def model_name(request):
+    return request.param


@pytest.fixture(scope="module")
@@ -23,8 +23,10 @@ def model_path(model_name: str):

@pytest.fixture(scope="module")
def server(model_path: str):
+    args = ["--kv_cache_free_gpu_memory_fraction", "0.8"]
    # fix port to facilitate concise trtllm-serve examples
-    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+    with RemoteOpenAIServer(model_path, cli_args=args,
+                            port=8000) as remote_server:
        yield remote_server


@@ -43,18 +45,24 @@ def dataset_path(dataset_name: str):


@skip_gpu_memory_less_than_80gb
+@pytest.mark.parametrize(
+    "model_name", ["llama-3.1-model/Meta-Llama-3.1-8B", "gpt_oss/gpt-oss-20b"],
+    indirect=True)
def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
                                model_path: str):
+    model_name = model_path.split("/")[-1]
    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
    dataset = dataset_path("sharegpt")
    benchmark_cmd = [
        "python3", client_script, "--dataset-name", "sharegpt", "--model",
-        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+        model_name, "--dataset-path", dataset, "--tokenizer", model_path
    ]

    # CalledProcessError will be raised if any errors occur
-    subprocess.run(benchmark_cmd,
-                   stdout=subprocess.PIPE,
-                   stderr=subprocess.PIPE,
-                   text=True,
-                   check=True)
+    result = subprocess.run(benchmark_cmd,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE,
+                            text=True,
+                            check=True)
+    assert result.returncode == 0
+    assert "Serving Benchmark Result" in result.stdout
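Note on the pattern used above: with indirect=True, pytest routes each parametrized value into the fixture of the same name via request.param, so module-scoped fixtures downstream of model_name (such as model_path and server) are rebuilt once per model. A minimal, self-contained sketch of the mechanism (hypothetical test and values, not part of this change):

import pytest


@pytest.fixture(scope="module")
def model_name(request):
    # request.param holds the value injected by the parametrize marker
    return request.param


@pytest.mark.parametrize("model_name", ["model-a", "model-b"], indirect=True)
def test_model_name_is_parametrized(model_name):
    # runs once per value; any fixture that depends on model_name is
    # re-created for each parametrization
    assert model_name in ("model-a", "model-b")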