Commit 1bf38cf

server/bench:
- support the OpenAI streaming standard output, terminated with `[DONE]\n\n`
- export k6 raw results in CSV
- fix too many idle TCP connections stuck in TIME_WAIT
- add a metric for the time to emit the first token
1 parent 26a8406 commit 1bf38cf
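
For context on the first bullet: in the OpenAI streaming format each SSE frame ends with a blank line (`\n\n`), the final data chunk carries only `usage` when `stream_options.include_usage` is set (its `choices` array is empty), and the stream closes with the `[DONE]` sentinel. A rough illustration of the tail of such a stream, not taken from this commit (the JSON bodies and token counts are placeholders):

```python
# Illustration only: approximate shape of the end of an OpenAI-compatible
# streamed chat completion, as the benchmark script below now tolerates.
# The field values and token counts are placeholders, not real output.
example_stream_tail = (
    'data: {"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n'
    'data: {"choices":[],"usage":{"prompt_tokens":128,"completion_tokens":256}}\n\n'
    'data: [DONE]\n\n'
)
```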

File tree

3 files changed: +21 -6 lines

- examples/server/bench/README.md
- examples/server/bench/bench.py
- examples/server/bench/script.js

examples/server/bench/README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
 
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
 --model ggml-model-q4_0.gguf \
 --cont-batching \
 --metrics \
````

examples/server/bench/bench.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
```
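
The new `--no-connection-reuse` / `--no-vu-connection-reuse` flags change how k6 handles TCP connections between requests (the idle-connection fix from the commit message), and `--out csv=k6-results.csv` dumps every raw metric sample next to the existing `k6-results.json` summary. A small post-processing sketch, not part of the commit, assuming k6's default CSV layout (which includes `metric_name` and `metric_value` columns) and the Trend name added in script.js below:

```python
# Sketch: summarize the new time-to-first-token samples from the raw CSV dump.
# Assumes k6's default CSV columns `metric_name` and `metric_value`.
import csv
import statistics

values = []
with open('k6-results.csv', newline='') as f:
    for row in csv.DictReader(f):
        if row['metric_name'] == 'llamacpp_emit_first_token_second':
            values.append(float(row['metric_value']))

if len(values) >= 2:
    p90 = statistics.quantiles(values, n=10)[-1]  # 90th percentile cut point
    print(f"time to emit first token: avg={statistics.mean(values):.3f}s "
          f"p90={p90:.3f}s n={len(values)}")
```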

examples/server/bench/script.js

Lines changed: 15 additions & 3 deletions
```diff
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
     client.on('event', function (event) {
         if (promptEvalEndTime == null) {
             promptEvalEndTime = new Date()
+            llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+        }
+
+        if (event.data === '[DONE]' || event.data === '') {
+            return
         }
 
         let chunk = JSON.parse(event.data)
-        let choice = chunk.choices[0]
-        if (choice.finish_reason) {
-            finish_reason = choice.finish_reason
+
+        if (chunk.choices && chunk.choices.length > 0) {
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
         }
 
         if (chunk.usage) {
```
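
Taken together, the script.js hunks add a time-to-first-token Trend, request the final `usage` chunk via `stream_options`, and skip the `[DONE]` / empty frames as well as usage-only chunks that carry no `choices`. As a cross-check outside k6, a stand-alone client along the same lines might look like the rough sketch below (not part of the commit): it assumes the llama.cpp server from the README listening on `http://localhost:8080/v1` with the standard `/chat/completions` route, uses only the Python standard library, and the prompt, model name, and `max_tokens` are placeholders.

```python
# Rough sketch, not part of the commit: measure time to emit the first token
# against an OpenAI-compatible streaming endpoint, mirroring script.js.
# Assumptions: server at http://localhost:8080/v1 (see README) with the
# standard /chat/completions route; prompt, model and max_tokens are placeholders.
import json
import time
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "Say hello."}],  # placeholder prompt
    "model": "ggml-model-q4_0.gguf",                           # placeholder model name
    "stream": True,
    "stream_options": {"include_usage": True},
    "max_tokens": 64,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)

start = time.time()
first_token_s = None
usage = None
with urllib.request.urlopen(req) as resp:
    for raw in resp:                        # the body is an SSE stream, read line by line
        line = raw.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue                        # blank separators between SSE frames
        data = line[len("data: "):]
        if data == "[DONE]" or data == "":
            continue                        # same sentinel handling as script.js
        if first_token_s is None:
            first_token_s = time.time() - start
        chunk = json.loads(data)
        if chunk.get("usage"):
            usage = chunk["usage"]          # usage-only chunk has an empty "choices" array

print(f"time to emit first token: {first_token_s:.3f}s, usage: {usage}")
```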
