2424 push :
2525 branches :
2626 - master
27- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
27+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
2828 pull_request_target :
2929 types : [opened, synchronize, reopened]
30- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
30+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
3131 schedule :
3232 - cron : '04 2 * * *'
3333
4242 RUNNER_LABEL : Standard_NC4as_T4_v3 # FIXME: found no way to avoid duplicating this value
4343 N_USERS : 8
4444 DURATION : 10m
45+
46+ strategy :
47+ matrix :
48+ model : [phi-2]
49+ ftype : [q4_0, q8_0, f16]
50+ include :
51+ - model : phi-2
52+ ftype : q4_0
53+ pr_comment_enabled : "true"
54+
4555 if : ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
4656 steps :
4757 - name : Clone
@@ -116,7 +126,7 @@ jobs:
116126 --scenario script.js \
117127 --duration ${{ github.event.inputs.duration || env.DURATION }} \
118128 --hf-repo ggml-org/models \
119- --hf-file phi-2/ggml-model-q4_0.gguf \
129+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
120130 --model-path-prefix /models \
121131 --parallel ${{ env.N_USERS }} \
122132 -ngl 33 \
@@ -134,7 +144,7 @@ jobs:
134144
135145 - uses : actions/upload-artifact@v4
136146 with :
137- name : benchmark-results
147+ name : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
138148 compression-level : 9
139149 path : |
140150 examples/server/bench/*.jpg
@@ -146,7 +156,7 @@ jobs:
146156 with :
147157 authToken : ${{secrets.GITHUB_TOKEN}}
148158 sha : ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
149- context : bench-server-baseline
159+ context : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
150160 description : |
151161 ${{ env.BENCH_RESULTS }}
152162 state : 'success'
@@ -203,21 +213,26 @@ jobs:
203213 - name : Comment PR
204214 uses : mshick/add-pr-comment@v2
205215 id : comment_pr
206- if : ${{ github.event.pull_request != '' }}
216+ if : ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
207217 with :
208- message-id : bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
218+ message-id : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
209219 message : |
210- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
220+ <p align="center">
221+
222+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
223+
224+ </p>
225+
226+ <details>
227+
228+ <summary>Expand details for performance related PR only</summary>
211229
212230 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
213231 - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
214232 - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
215233 - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
216234 - ${{ env.BENCH_GRAPH_XLABEL }}
217235
218- <details>
219-
220- <summary>Time series</summary>
221236
222237 <p align="center">
223238
0 commit comments