@@ -28,6 +28,7 @@ options:
   -p, --n-prompt <n>                       (default: 512)
   -n, --n-gen <n>                          (default: 128)
   -pg <pp,tg>                              (default: )
+  -d, --n-depth <n>                        (default: 0)
   -b, --batch-size <n>                     (default: 2048)
   -ub, --ubatch-size <n>                   (default: 512)
   -ctk, --cache-type-k <t>                 (default: f16)
@@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
 
 Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
 
+Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
+
 For a description of the other options, see the [main example](../main/README.md).
 
 Note:
@@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
 
+### Different prefilled context
+
+```
+$ ./llama-bench -d 0,512
+```
+
+| model | size | params | backend | ngl | test | t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
+| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
+| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
+| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
+
 ## Output formats
 
 By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv
 ```
 
 ```csv
-build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
-"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
-"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
+"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
 ```
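The CSV report is easy to post-process with standard tooling. A minimal Python sketch against the updated column set shown in the added lines (the file name `results.csv` is an assumption; redirect the command's output to create it):

```python
import csv

# Read a report produced with: ./llama-bench -o csv > results.csv
with open("results.csv", newline="") as f:
    for row in csv.DictReader(f):
        # n_depth is only present in the updated format added by this change.
        test = f"pp{row['n_prompt']} tg{row['n_gen']} d{row['n_depth']}"
        print(f"{row['model_type']}: {test} -> {row['avg_ts']} ± {row['stddev_ts']} t/s")
```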
 
 ### JSON
@@ -184,64 +200,78 @@ $ ./llama-bench -o json
 ```json
 [
   {
-    "build_commit": "3469684",
-    "build_number": 1275,
-    "cuda": true,
-    "metal": false,
-    "gpu_blas": true,
-    "blas": true,
-    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-    "model_filename": "models/7B/ggml-model-q4_0.gguf",
-    "model_type": "llama 7B mostly Q4_0",
-    "model_size": 3825065984,
-    "model_n_params": 6738415616,
-    "n_batch": 512,
-    "n_threads": 16,
-    "f16_kv": true,
+    "build_commit": "8cf427ff",
+    "build_number": 5163,
+    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+    "gpu_info": "NVIDIA GeForce RTX 4080",
+    "backends": "CUDA",
+    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+    "model_type": "qwen2 7B Q4_K - Medium",
+    "model_size": 4677120000,
+    "model_n_params": 7615616512,
+    "n_batch": 2048,
+    "n_ubatch": 512,
+    "n_threads": 8,
+    "cpu_mask": "0x0",
+    "cpu_strict": false,
+    "poll": 50,
+    "type_k": "f16",
+    "type_v": "f16",
     "n_gpu_layers": 99,
+    "split_mode": "layer",
     "main_gpu": 0,
-    "mul_mat_q": true,
+    "no_kv_offload": false,
+    "flash_attn": false,
     "tensor_split": "0.00",
+    "use_mmap": true,
+    "embeddings": false,
     "n_prompt": 512,
     "n_gen": 0,
-    "test_time": "2023-09-23T12:09:57Z",
-    "avg_ns": 212365953,
-    "stddev_ns": 985423,
-    "avg_ts": 2410.974041,
-    "stddev_ts": 11.163766,
-    "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
-    "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
+    "n_depth": 0,
+    "test_time": "2025-04-24T11:58:50Z",
+    "avg_ns": 72135640,
+    "stddev_ns": 1453752,
+    "avg_ts": 7100.002165,
+    "stddev_ts": 140.341520,
+    "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
+    "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
   },
   {
-    "build_commit": "3469684",
-    "build_number": 1275,
-    "cuda": true,
-    "metal": false,
-    "gpu_blas": true,
-    "blas": true,
-    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-    "model_filename": "models/7B/ggml-model-q4_0.gguf",
-    "model_type": "llama 7B mostly Q4_0",
-    "model_size": 3825065984,
-    "model_n_params": 6738415616,
-    "n_batch": 512,
-    "n_threads": 16,
-    "f16_kv": true,
+    "build_commit": "8cf427ff",
+    "build_number": 5163,
+    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+    "gpu_info": "NVIDIA GeForce RTX 4080",
+    "backends": "CUDA",
+    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+    "model_type": "qwen2 7B Q4_K - Medium",
+    "model_size": 4677120000,
+    "model_n_params": 7615616512,
+    "n_batch": 2048,
+    "n_ubatch": 512,
+    "n_threads": 8,
+    "cpu_mask": "0x0",
+    "cpu_strict": false,
+    "poll": 50,
+    "type_k": "f16",
+    "type_v": "f16",
     "n_gpu_layers": 99,
+    "split_mode": "layer",
     "main_gpu": 0,
-    "mul_mat_q": true,
+    "no_kv_offload": false,
+    "flash_attn": false,
     "tensor_split": "0.00",
+    "use_mmap": true,
+    "embeddings": false,
     "n_prompt": 0,
     "n_gen": 128,
-    "test_time": "2023-09-23T12:09:59Z",
-    "avg_ns": 977425219,
-    "stddev_ns": 9268593,
-    "avg_ts": 130.965708,
-    "stddev_ts": 1.238924,
-    "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
-    "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
+    "n_depth": 0,
+    "test_time": "2025-04-24T11:58:51Z",
+    "avg_ns": 1076767880,
+    "stddev_ns": 9449585,
+    "avg_ts": 118.881588,
+    "stddev_ts": 1.041811,
+    "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
+    "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
   }
 ]
 ```
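The `samples_ns` / `samples_ts` arrays make it possible to re-derive the reported averages: each repetition is converted to tokens per second, and `avg_ts` / `stddev_ts` are the mean and sample standard deviation of those values. A minimal Python sketch of that calculation (the file name `results.json` is an assumption; llama-bench writes to stdout, so redirect it first):

```python
import json
import statistics

# Load a report produced with: ./llama-bench -o json > results.json
with open("results.json") as f:
    results = json.load(f)

for r in results:
    # Tokens timed per repetition: n_prompt for a pp test, n_gen for a tg test.
    n_tokens = r["n_prompt"] + r["n_gen"]
    # Convert each repetition from nanoseconds to tokens per second.
    samples_ts = [n_tokens / (ns * 1e-9) for ns in r["samples_ns"]]
    avg_ts = statistics.mean(samples_ts)
    stddev_ts = statistics.stdev(samples_ts)  # sample standard deviation, as reported
    print(f"{r['model_type']} ({n_tokens} tokens): "
          f"{avg_ts:.2f} ± {stddev_ts:.2f} t/s "
          f"(reported: {r['avg_ts']:.2f} ± {r['stddev_ts']:.2f})")
```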
@@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
 ```
 
 ```json lines
-{"build_commit": "3469684", "build_number": 1275, "cuda": true, "metal": false, "gpu_blas": true, "blas": true, "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", "model_type": "llama 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, "n_threads": 16, "f16_kv": true, "n_gpu_layers": 99, "main_gpu": 0, "mul_mat_q": true, "tensor_split": "0.00", "n_prompt": 512, "n_gen": 0, "test_time": "2023-09-23T12:09:57Z", "avg_ns": 212365953, "stddev_ns": 985423, "avg_ts": 2410.974041, "stddev_ts": 11.163766, "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]}
-{"build_commit": "3469684", "build_number": 1275, "cuda": true, "metal": false, "gpu_blas": true, "blas": true, "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", "model_type": "llama 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, "n_threads": 16, "f16_kv": true, "n_gpu_layers": 99, "main_gpu": 0, "mul_mat_q": true, "tensor_split": "0.00", "n_prompt": 0, "n_gen": 128, "test_time": "2023-09-23T12:09:59Z", "avg_ns": 977425219, "stddev_ns": 9268593, "avg_ts": 130.965708, "stddev_ts": 1.238924, "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]}
+{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
+{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
 ```
 
 
@@ -271,32 +301,39 @@ $ ./llama-bench -o sql
 CREATE TABLE IF NOT EXISTS test (
   build_commit TEXT,
   build_number INTEGER,
-  cuda INTEGER,
-  metal INTEGER,
-  gpu_blas INTEGER,
-  blas INTEGER,
   cpu_info TEXT,
   gpu_info TEXT,
+  backends TEXT,
   model_filename TEXT,
   model_type TEXT,
   model_size INTEGER,
   model_n_params INTEGER,
   n_batch INTEGER,
+  n_ubatch INTEGER,
   n_threads INTEGER,
-  f16_kv INTEGER,
+  cpu_mask TEXT,
+  cpu_strict INTEGER,
+  poll INTEGER,
+  type_k TEXT,
+  type_v TEXT,
   n_gpu_layers INTEGER,
+  split_mode TEXT,
   main_gpu INTEGER,
-  mul_mat_q INTEGER,
+  no_kv_offload INTEGER,
+  flash_attn INTEGER,
   tensor_split TEXT,
+  use_mmap INTEGER,
+  embeddings INTEGER,
   n_prompt INTEGER,
   n_gen INTEGER,
+  n_depth INTEGER,
   test_time TEXT,
   avg_ns INTEGER,
   stddev_ns INTEGER,
   avg_ts REAL,
   stddev_ts REAL
 );
 
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
+INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
 ```
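The SQL output is a self-contained dump (a `CREATE TABLE IF NOT EXISTS` plus one `INSERT` per result), so runs can be accumulated into a database and queried later. A minimal Python sketch using the standard-library `sqlite3` module (the file names `results.sql` and `bench.sqlite` are assumptions, not part of llama-bench):

```python
import sqlite3

# Load a dump produced with: ./llama-bench -o sql > results.sql
con = sqlite3.connect("bench.sqlite")
with open("results.sql") as f:
    con.executescript(f.read())

# The INTEGER/REAL column affinities convert the quoted values on insert,
# so the stored numbers can be queried and formatted directly.
query = "SELECT model_type, n_prompt, n_gen, n_depth, avg_ts, stddev_ts FROM test"
for model_type, n_prompt, n_gen, n_depth, avg_ts, stddev_ts in con.execute(query):
    print(f"{model_type}: pp{n_prompt} tg{n_gen} d{n_depth} -> "
          f"{avg_ts:.2f} ± {stddev_ts:.2f} t/s")

con.close()
```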