@@ -189,12 +189,12 @@ def main(args_in: list[str] | None = None) -> None:
189189 "pp" : {
190190 "p95" : round (data ['metrics' ]["llamacpp_prompt_processing_second" ]["p(95)" ], 2 ),
191191 "avg" : round (data ['metrics' ]["llamacpp_prompt_processing_second" ]["avg" ], 2 ),
192- "0" : round (mean (prometheus_metrics ['prompt_tokens_seconds' ]), 2 ),
192+ "0" : round (mean (prometheus_metrics ['prompt_tokens_seconds' ]), 2 ) if 'prompt_tokens_seconds' in prometheus_metrics else 0 ,
193193 },
194194 "tg" : {
195195 "p95" : round (data ['metrics' ]["llamacpp_tokens_second" ]["p(95)" ], 2 ),
196196 "avg" : round (data ['metrics' ]["llamacpp_tokens_second" ]["avg" ], 2 ),
197- "0" : round (mean (prometheus_metrics ['predicted_tokens_seconds' ]), 2 ),
197+ "0" : round (mean (prometheus_metrics ['predicted_tokens_seconds' ]), 2 ) if 'predicted_tokens_seconds' in prometheus_metrics else 0 ,
198198 },
199199 }
200200 with open ("results.github.env" , 'a' ) as github_env :
@@ -214,11 +214,14 @@ def start_benchmark(args):
214214 k6_args = [
215215 'run' , args .scenario ,
216216 '--no-color' ,
217+ '--no-connection-reuse' ,
218+ '--no-vu-connection-reuse' ,
217219 ]
218220 k6_args .extend (['--duration' , args .duration ])
219221 k6_args .extend (['--iterations' , args .n_prompts ])
220222 k6_args .extend (['--vus' , args .parallel ])
221223 k6_args .extend (['--summary-export' , 'k6-results.json' ])
224+ k6_args .extend (['--out' , 'csv=k6-results.csv' ])
222225 args = f"SERVER_BENCH_N_PROMPTS={ args .n_prompts } SERVER_BENCH_MAX_PROMPT_TOKENS={ args .max_prompt_tokens } SERVER_BENCH_MAX_CONTEXT={ args .max_tokens } "
223226 args = args + ' ' .join ([str (arg ) for arg in [k6_path , * k6_args ]])
224227 print (f"bench: starting k6 with: { args } " )
@@ -231,7 +234,7 @@ def start_server(args):
231234 server_process = start_server_background (args )
232235
233236 attempts = 0
234- max_attempts = 20
237+ max_attempts = 600
235238 if 'GITHUB_ACTIONS' in os .environ :
236239 max_attempts *= 2
237240
@@ -242,7 +245,15 @@ def start_server(args):
242245 print (f"bench: waiting for server to start ..." )
243246 time .sleep (0.5 )
244247
245- print ("bench: server started." )
248+ attempts = 0
249+ while not is_server_ready (args .host , args .port ):
250+ attempts += 1
251+ if attempts > max_attempts :
252+ assert False , "server not ready"
253+ print (f"bench: waiting for server to be ready ..." )
254+ time .sleep (0.5 )
255+
256+ print ("bench: server started and ready." )
246257 return server_process
247258
248259
@@ -255,11 +266,6 @@ def start_server_background(args):
255266 '--host' , args .host ,
256267 '--port' , args .port ,
257268 ]
258- model_file = args .model_path_prefix + os .path .sep + args .hf_file
259- model_dir = os .path .dirname (model_file )
260- if not os .path .exists (model_dir ):
261- os .makedirs (model_dir )
262- server_args .extend (['--model' , model_file ])
263269 server_args .extend (['--hf-repo' , args .hf_repo ])
264270 server_args .extend (['--hf-file' , args .hf_file ])
265271 server_args .extend (['--n-gpu-layers' , args .n_gpu_layers ])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
303309 return _is_server_listening
304310
305311
def is_server_ready(server_fqdn, server_port):
    """Return True when the server's ``/health`` endpoint answers HTTP 200.

    Args:
        server_fqdn: Host name or address used to build the health URL.
        server_port: TCP port used to build the health URL.

    Returns:
        True if the GET request completes with status code 200. False for any
        other status code, or when the request fails or times out, so that a
        polling caller can simply retry.
    """
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        # Bound the wait: without a timeout a hung server would block this
        # call forever and the caller's attempt counter would never advance.
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        # Connection refused/reset or timeout means "not ready yet", not a
        # fatal error.
        return False
    return response.status_code == 200
316+
317+
def escape_metric_name(metric_name):
    """Upper-case *metric_name* and replace every character outside
    ``A-Z`` / ``0-9`` with an underscore."""
    uppercased = metric_name.upper()
    sanitized = re.sub('[^A-Z0-9]', '_', uppercased)
    return sanitized
308320
0 commit comments