@@ -65,10 +65,17 @@ def send_request(url, request, user=None):
         stream=True,
     )
     first_line = True
+    inter_token_latencies = []
+    last_token_time = None
     for byte_payload in response.iter_lines():
+        token_time = time.time()
         if first_line:
-            time_to_first_token = time.time() - start
+            time_to_first_token = token_time - start
+            last_token_time = token_time
             first_line = False
+        else:
+            inter_token_latencies.append(token_time - last_token_time)
+            last_token_time = token_time

         # Skip line
         if byte_payload == b"\n":
@@ -85,6 +92,7 @@ def send_request(url, request, user=None):
8592 "payload" : payload_json ,
8693 "time_to_first_token" : time_to_first_token ,
8794 "total_time" : time .time () - start ,
95+ "inter_token_latencies" : inter_token_latencies ,
8896 }
8997
9098
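For intuition, here is a minimal sketch of the timing logic the two hunks above add, with a hypothetical fake_stream() standing in for response.iter_lines(): the first streamed chunk yields time_to_first_token, and every later chunk contributes one gap to inter_token_latencies.

import time

def fake_stream(n_chunks=5, delay=0.01):
    # Hypothetical stand-in for response.iter_lines(); each sleep
    # simulates the server generating one more token.
    for i in range(n_chunks):
        time.sleep(delay)
        yield f"token-{i}".encode()

start = time.time()
inter_token_latencies = []
last_token_time = None
time_to_first_token = None
for byte_payload in fake_stream():
    token_time = time.time()
    if last_token_time is None:
        time_to_first_token = token_time - start  # first chunk: TTFT
    else:
        inter_token_latencies.append(token_time - last_token_time)
    last_token_time = token_time

print(time_to_first_token, inter_token_latencies)  # one TTFT, n_chunks - 1 gaps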
@@ -255,7 +263,9 @@ def run_benchmark(
     time_to_process_prompt = []
     time_per_completion = []
     time_to_first_token = []
-    inter_token_latency = []
+    inter_token_latency = []  # one value per request: average inter-token latency within the request
+    total_request_time = []
+    all_inter_token_latencies = []  # one value per token (except the first generated token)
     for result in results:
         avg_time_per_token = (result["total_time"] - result["time_to_first_token"]) / (
             result["num_completion_tokens"] - 1
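A worked example of the per-request average computed above (illustrative numbers, not from a real run): the first token lands at time_to_first_token, so the remaining num_completion_tokens - 1 tokens share the rest of the request time.

total_time = 2.5             # seconds for the whole request
time_to_first_token = 0.5    # seconds until the first streamed token
num_completion_tokens = 21   # tokens generated in total
# The 20 gaps after the first token span the remaining 2.0 seconds:
avg_time_per_token = (total_time - time_to_first_token) / (num_completion_tokens - 1)
assert abs(avg_time_per_token - 0.1) < 1e-9  # 100 ms per token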
@@ -264,28 +274,57 @@ def run_benchmark(
         time_to_process_prompt.append(result["time_to_first_token"] - avg_time_per_token)
         time_per_completion.append(result["total_time"] - time_to_process_prompt[-1])
         inter_token_latency.append(avg_time_per_token)
+        total_request_time.append(result["total_time"])
+        all_inter_token_latencies.extend(result["inter_token_latencies"])

     total_num_tokens = num_sampled_tokens + num_prompt_tokens
     avg_prefill_time = sum(time_to_process_prompt) / n
     avg_completion_time = sum(time_per_completion) / n
+    p50_request_time = np.percentile(total_request_time, 50)
+    p90_request_time = np.percentile(total_request_time, 90)
+    p95_request_time = np.percentile(total_request_time, 95)
+    p99_request_time = np.percentile(total_request_time, 99)
+    p50_inter_token_latency = np.percentile(all_inter_token_latencies, 50)
+    p90_inter_token_latency = np.percentile(all_inter_token_latencies, 90)
+    p95_inter_token_latency = np.percentile(all_inter_token_latencies, 95)
+    p99_inter_token_latency = np.percentile(all_inter_token_latencies, 99)
+    p999_inter_token_latency = np.percentile(all_inter_token_latencies, 99.9)
+    p50_time_to_first_token = np.percentile(time_to_first_token, 50)
+    p90_time_to_first_token = np.percentile(time_to_first_token, 90)
+    p95_time_to_first_token = np.percentile(time_to_first_token, 95)
+    p99_time_to_first_token = np.percentile(time_to_first_token, 99)

     statistics = {
         "concurrency": concurrency,
         "avg_prompt_throughput": num_prompt_tokens
         / (elapsed * avg_prefill_time / (avg_prefill_time + avg_completion_time)),
         "avg_time_to_first_token": sum(time_to_first_token) / n,
+        "p50_time_to_first_token": p50_time_to_first_token,
+        "p90_time_to_first_token": p90_time_to_first_token,
+        "p95_time_to_first_token": p95_time_to_first_token,
+        "p99_time_to_first_token": p99_time_to_first_token,
         "avg_sampling_throughput": num_sampled_tokens
         / (elapsed * avg_completion_time / (avg_prefill_time + avg_completion_time)),
         "avg_total_throughput": total_num_tokens / elapsed,
         "avg_per_session_sampling_throughput": num_sampled_tokens
         / (elapsed * avg_completion_time / (avg_prefill_time + avg_completion_time))
         / concurrency,
+        "avg_request_throughput": n / elapsed,
         "avg_inter_token_latency": sum(inter_token_latency) / n,
+        "p50_inter_token_latency": p50_inter_token_latency,
+        "p90_inter_token_latency": p90_inter_token_latency,
+        "p95_inter_token_latency": p95_inter_token_latency,
+        "p99_inter_token_latency": p99_inter_token_latency,
+        "p99.9_inter_token_latency": p999_inter_token_latency,
         "num_prompt_tokens": prompt_num_tokens,
         "avg_num_sampled_tokens": num_sampled_tokens / n,
         "elapsed_time": elapsed,
         "avg_prefill_time": avg_prefill_time,
         "avg_completion_time": avg_completion_time,
+        "p50_request_time": p50_request_time,
+        "p90_request_time": p90_request_time,
+        "p95_request_time": p95_request_time,
+        "p99_request_time": p99_request_time,
         "num_requests": num_trials,
         "num_successful_requests": n,
         "total_num_tokens": total_num_tokens,
@@ -361,6 +400,7 @@ def run_benchmarks_concurrency_range(
     use_localhost: bool = False,
     concurrency_min: int = 1,
     concurrency_max: int = 1,
+    concurrency_step: int = 1,
     verbose: bool = False,
     hf_model: Optional[str] = None,
     local_port: int = 5005,
@@ -369,7 +409,7 @@ def run_benchmarks_concurrency_range(
     # Create empty file
     with open(output_file, "w"):
         pass
-    for concurrency in range(concurrency_min, concurrency_max + 1):
+    for concurrency in range(concurrency_min, concurrency_max + 1, concurrency_step):
         run_benchmarks(
             model,
             framework,
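One caveat with the new step parameter, shown in a quick sketch: range() stops before concurrency_max + 1, so concurrency_max itself is only benchmarked when concurrency_max - concurrency_min is a multiple of concurrency_step.

concurrency_min, concurrency_max, concurrency_step = 1, 8, 3
print(list(range(concurrency_min, concurrency_max + 1, concurrency_step)))
# [1, 4, 7] -- the sweep never reaches concurrency_max=8 here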