
Commit b8688a2

LLM benchmark script improvements (#427)
* step concurrency
* add completion time percentiles
* fix up percentiles to be total request time
* rename some things
* wip percentiles for inter token latency
* actually record the numbers
* oops
* add percentiles for time to first token
1 parent 5b54c29 commit b8688a2
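The percentile items above all follow the same pattern: collect a raw series while the benchmark runs (total time per request, time to first token per request, and the gaps between consecutive streamed tokens pooled across requests), then summarize each series with numpy, as the diff below does. A minimal sketch of that roll-up, with made-up sample values standing in for real benchmark measurements:

import numpy as np

# Made-up measurements, stand-ins for what the benchmark actually records:
# one total time and one time-to-first-token per request, plus one latency
# per generated token (after the first) pooled across all requests.
total_request_time = [1.92, 2.10, 2.45, 1.88]
time_to_first_token = [0.21, 0.25, 0.31, 0.19]
all_inter_token_latencies = [0.018, 0.022, 0.020, 0.035, 0.019, 0.041]

# np.percentile interpolates between samples, so high percentiles are still
# defined even when a series has far fewer than 100 entries.
for name, series in [
    ("request_time", total_request_time),
    ("time_to_first_token", time_to_first_token),
    ("inter_token_latency", all_inter_token_latencies),
]:
    for p in (50, 90, 95, 99):
        print(f"p{p}_{name}: {np.percentile(series, p):.4f}")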

scripts/throughput_benchmarks.py

Lines changed: 43 additions & 3 deletions
@@ -65,10 +65,17 @@ def send_request(url, request, user=None):
         stream=True,
     )
     first_line = True
+    inter_token_latencies = []
+    last_token_time = None
     for byte_payload in response.iter_lines():
+        token_time = time.time()
         if first_line:
-            time_to_first_token = time.time() - start
+            time_to_first_token = token_time - start
+            last_token_time = token_time
             first_line = False
+        else:
+            inter_token_latencies.append(token_time - last_token_time)
+            last_token_time = token_time

         # Skip line
         if byte_payload == b"\n":
@@ -85,6 +92,7 @@ def send_request(url, request, user=None):
         "payload": payload_json,
         "time_to_first_token": time_to_first_token,
         "total_time": time.time() - start,
+        "inter_token_latencies": inter_token_latencies,
     }


@@ -255,7 +263,9 @@ def run_benchmark(
     time_to_process_prompt = []
     time_per_completion = []
     time_to_first_token = []
-    inter_token_latency = []
+    inter_token_latency = []  # one value per request, average inter-token latency in the request
+    total_request_time = []
+    all_inter_token_latencies = []  # one value per token (except the first generated token)
     for result in results:
         avg_time_per_token = (result["total_time"] - result["time_to_first_token"]) / (
             result["num_completion_tokens"] - 1
@@ -264,28 +274,57 @@ def run_benchmark(
         time_to_process_prompt.append(result["time_to_first_token"] - avg_time_per_token)
         time_per_completion.append(result["total_time"] - time_to_process_prompt[-1])
         inter_token_latency.append(avg_time_per_token)
+        total_request_time.append(result["total_time"])
+        all_inter_token_latencies.extend(result["inter_token_latencies"])

     total_num_tokens = num_sampled_tokens + num_prompt_tokens
     avg_prefill_time = sum(time_to_process_prompt) / n
     avg_completion_time = sum(time_per_completion) / n
+    p50_request_time = np.percentile(total_request_time, 50)
+    p90_request_time = np.percentile(total_request_time, 90)
+    p95_request_time = np.percentile(total_request_time, 95)
+    p99_request_time = np.percentile(total_request_time, 99)
+    p50_inter_token_latency = np.percentile(all_inter_token_latencies, 50)
+    p90_inter_token_latency = np.percentile(all_inter_token_latencies, 90)
+    p95_inter_token_latency = np.percentile(all_inter_token_latencies, 95)
+    p99_inter_token_latency = np.percentile(all_inter_token_latencies, 99)
+    p999_inter_token_latency = np.percentile(all_inter_token_latencies, 99.9)
+    p50_time_to_first_token = np.percentile(time_to_first_token, 50)
+    p90_time_to_first_token = np.percentile(time_to_first_token, 90)
+    p95_time_to_first_token = np.percentile(time_to_first_token, 95)
+    p99_time_to_first_token = np.percentile(time_to_first_token, 99)

     statistics = {
         "concurrency": concurrency,
         "avg_prompt_throughput": num_prompt_tokens
         / (elapsed * avg_prefill_time / (avg_prefill_time + avg_completion_time)),
         "avg_time_to_first_token": sum(time_to_first_token) / n,
+        "p50_time_to_first_token": p50_time_to_first_token,
+        "p90_time_to_first_token": p90_time_to_first_token,
+        "p95_time_to_first_token": p95_time_to_first_token,
+        "p99_time_to_first_token": p99_time_to_first_token,
         "avg_sampling_throughput": num_sampled_tokens
         / (elapsed * avg_completion_time / (avg_prefill_time + avg_completion_time)),
         "avg_total_throughput": total_num_tokens / elapsed,
         "avg_per_session_sampling_throughput": num_sampled_tokens
         / (elapsed * avg_completion_time / (avg_prefill_time + avg_completion_time))
         / concurrency,
+        "avg_request_throughput": n / elapsed,
         "avg_inter_token_latency": sum(inter_token_latency) / n,
+        "p50_inter_token_latency": p50_inter_token_latency,
+        "p90_inter_token_latency": p90_inter_token_latency,
+        "p95_inter_token_latency": p95_inter_token_latency,
+        "p99_inter_token_latency": p99_inter_token_latency,
+        "p99.9_inter_token_latency": p999_inter_token_latency,
         "num_prompt_tokens": prompt_num_tokens,
         "avg_num_sampled_tokens": num_sampled_tokens / n,
         "elapsed_time": elapsed,
         "avg_prefill_time": avg_prefill_time,
         "avg_completion_time": avg_completion_time,
+        "p50_request_time": p50_request_time,
+        "p90_request_time": p90_request_time,
+        "p95_request_time": p95_request_time,
+        "p99_request_time": p99_request_time,
         "num_requests": num_trials,
         "num_successful_requests": n,
         "total_num_tokens": total_num_tokens,
@@ -361,6 +400,7 @@ def run_benchmarks_concurrency_range(
     use_localhost: bool = False,
     concurrency_min: int = 1,
     concurrency_max: int = 1,
+    concurrency_step: int = 1,
     verbose: bool = False,
     hf_model: Optional[str] = None,
     local_port: int = 5005,
@@ -369,7 +409,7 @@ def run_benchmarks_concurrency_range(
     # Create empty file
     with open(output_file, "w"):
         pass
-    for concurrency in range(concurrency_min, concurrency_max + 1):
+    for concurrency in range(concurrency_min, concurrency_max + 1, concurrency_step):
         run_benchmarks(
             model,
             framework,
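The concurrency_step addition only changes which concurrency levels the existing sweep visits: Python's range takes the step as its third argument. An illustrative sketch, using arbitrary min/max values rather than anything taken from the script:

# Swept concurrency levels before and after this change, for the
# illustrative values concurrency_min=1, concurrency_max=16.
concurrency_min, concurrency_max = 1, 16

# Before: every level from min to max inclusive.
before = list(range(concurrency_min, concurrency_max + 1))

# After: the same bounds, skipping by concurrency_step.
concurrency_step = 5
after = list(range(concurrency_min, concurrency_max + 1, concurrency_step))

print(before)  # 1 through 16, every level
print(after)   # [1, 6, 11, 16]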