diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fbf8..008f284fef519 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -25,9 +25,9 @@ usage: ./llama-bench [options] options: -h, --help -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) + -p, --n-prompt (default: 0) + -n, --n-gen (default: 32) + -pg (default: 4096,32) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -74,20 +74,34 @@ Note: ## Examples +### Prompt processing and text generation + +```sh +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -p 0 -n 0 -pg 100,100 -pg 500,100 +``` + +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp100+tg100 | 14303.91 ± 362.95 | 455.50 ± 11.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp500+tg100 | 28026.49 ± 970.98 | 440.05 ± 4.57 | + + ### Text generation with different models ```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -m models/Llama-3.2-3B-Instruct-Q4_K_M.gguf -p 0 -n 128,256,512 ``` -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | 
CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.34 ± 2.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 459.78 ± 9.43 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 449.25 ± 11.74 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15545.82 ± 8.35 | 385.90 ± 3.47 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 212.78 ± 5.12 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 214.56 ± 2.16 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 212.84 ± 1.41 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | pp4096+tg32 | 8825.07 ± 100.28 | 177.25 ± 1.89 | ### Prompt processing with different batch sizes @@ -95,12 +109,16 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0. 
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 ``` -| model | size | params | backend | ngl | n_batch | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| model | params | backend | ngl | n_batch | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 17125.18 ± 731.13 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp4096+tg32 | 12139.39 ± 446.63 | 378.76 ± 8.18 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 24112.17 ± 161.18 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp4096+tg32 | 14508.80 ± 53.00 | 386.58 ± 0.42 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25534.56 ± 368.03 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp4096+tg32 | 15388.41 ± 13.06 | 386.30 ± 0.53 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25654.61 ± 772.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp4096+tg32 | 15487.92 ± 8.59 | 385.20 ± 0.50 | ### Different numbers of threads @@ -108,20 +126,26 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 ``` -| model | size | params | backend | threads | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 
3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || +| model | params | backend | ngl | threads | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 9229.99 ± 1897.41 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.33 ± 25.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp4096+tg32 | 15357.53 ± 27.52 | 373.90 ± 7.03 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 10799.57 ± 33.90 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 461.43 ± 10.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp4096+tg32 | 15371.18 ± 57.24 | 372.59 ± 4.02 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 11033.35 ± 177.05 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 448.57 ± 8.66 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp4096+tg32 | 15371.12 ± 43.70 | 376.71 ± 0.93 | +| 
llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 11206.45 ± 187.47 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 457.99 ± 6.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp4096+tg32 | 15022.14 ± 161.68 | 369.76 ± 4.71 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10397.19 ± 304.08 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 457.53 ± 7.06 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp4096+tg32 | 15434.32 ± 158.08 | 372.00 ± 3.34 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 10588.34 ± 1043.71 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 468.10 ± 9.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp4096+tg32 | 15544.54 ± 4.30 | 374.14 ± 7.18 | ### Different numbers of layers offloaded to the GPU @@ -129,24 +153,24 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 ``` -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | 
CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg32 | 0.00 ± 0.00 | 107.29 ± 1.37 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp4096+tg32 | 8458.79 ± 154.44 | 70.84 ± 0.10 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | tg32 | 0.00 ± 0.00 | 484.02 ± 0.93 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp4096+tg32 | 15303.20 ± 120.74 | 372.57 ± 6.32 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg32 | 0.00 ± 0.00 | 473.82 ± 4.27 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp4096+tg32 | 15372.85 ± 239.94 | 378.99 ± 4.72 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg32 | 0.00 ± 0.00 | 474.76 ± 7.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp4096+tg32 | 15373.12 ± 263.84 | 377.83 ± 12.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg32 | 0.00 ± 0.00 | 482.19 ± 0.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp4096+tg32 | 15515.24 ± 15.85 | 369.73 ± 0.23 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg32 | 0.00 ± 0.00 | 482.07 ± 0.63 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp4096+tg32 | 15299.93 ± 261.50 | 373.32 ± 9.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg32 | 0.00 ± 0.00 | 482.89 ± 0.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp4096+tg32 | 15551.65 ± 14.10 | 
381.00 ± 6.75 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg32 | 0.00 ± 0.00 | 481.55 ± 1.15 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp4096+tg32 | 15565.34 ± 5.96 | 385.77 ± 0.25 | ## Output formats @@ -158,10 +182,10 @@ By default, llama-bench outputs the results in markdown format. The results can $ ./llama-bench -o md ``` -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg32 | 0.00 ± 0.00 | 455.34 ± 13.25 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15479.05 ± 93.15 | 383.70 ± 2.79 | ### CSV @@ -170,9 +194,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" 
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,test_time,avg_prompt_ns,stddev_prompt_ns,avg_prompt_ts,stddev_prompt_ts,avg_gen_ns,stddev_gen_ns,avg_gen_ts,stddev_gen_ts +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","32","2025-04-18T11:21:18Z","66","58","0.000000","0.000000","71886000","7590","445.149267","0.046999" +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","4096","32","2025-04-18T11:21:18Z","272293733","3247466","15044.014817","180.586130","87201066","125581","366.968490","0.525734" ``` ### JSON @@ -184,64 +208,88 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "fa6cb8ae", + "build_number": 5100, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 
1235814432, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", - "n_prompt": 512, - "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "use_mmap": true, + "embeddings": false, + "n_prompt": 0, + "n_gen": 32, + "test_time": "2025-04-18T11:21:45Z", + "avg_prompt_ns": 66, + "stddev_prompt_ns": 58, + "avg_prompt_ts": 0.000000, + "stddev_prompt_ts": 0.000000, + "avg_gen_ns": 67903233, + "stddev_gen_ns": 498856, + "avg_gen_ts": 471.275875, + "stddev_gen_ts": 3.475513, + "samples_prompt_ns": [ 100, 0, 100 ], + "samples_prompt_ts": [ 0 ] + "samples_gen_ns": [ 68251300, 68126600, 67331800 ], + "samples_gen_ts": [ 468.856, 469.714, 475.258 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "fa6cb8ae", + "build_number": 5100, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 1235814432, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": 
false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", - "n_prompt": 0, - "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "use_mmap": true, + "embeddings": false, + "n_prompt": 4096, + "n_gen": 32, + "test_time": "2025-04-18T11:21:46Z", + "avg_prompt_ns": 263273600, + "stddev_prompt_ns": 273278, + "avg_prompt_ts": 15557.970647, + "stddev_prompt_ts": 16.143068, + "avg_gen_ns": 85820333, + "stddev_gen_ns": 4372337, + "avg_gen_ts": 373.500825, + "stddev_gen_ts": 18.514532, + "samples_prompt_ns": [ 263043600, 263201500, 263575700 ], + "samples_prompt_ts": [ 15571.6, 15562.2, 15540.1 ] + "samples_gen_ns": [ 82844300, 83776400, 90840300 ], + "samples_gen_ts": [ 386.267, 381.969, 352.267 ] } ] ``` @@ -254,8 +302,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) 
Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 0, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 71156300, "stddev_gen_ns": 912152, "avg_gen_ts": 449.763857, "stddev_gen_ts": 5.808090, "samples_prompt_ns": [ 100, 100, 100 ],"samples_prompt_ts": [ 0 ]"samples_gen_ns": [ 71725200, 71639500, 70104200 ],"samples_gen_ts": [ 446.147, 446.681, 456.463 ]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, 
"cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 4096, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 267673800, "stddev_prompt_ns": 4917668, "avg_prompt_ts": 15305.627579, "stddev_prompt_ts": 279.255714, "avg_gen_ns": 83914500, "stddev_gen_ns": 1515058, "avg_gen_ts": 381.422650, "stddev_gen_ts": 6.822569, "samples_prompt_ns": [ 266315000, 273128000, 263578400 ],"samples_prompt_ts": [ 15380.3, 14996.6, 15540 ]"samples_gen_ns": [ 85644600, 83274100, 82824800 ],"samples_gen_ts": [ 373.637, 384.273, 386.358 ]} ``` @@ -271,32 +319,42 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, test_time TEXT, - avg_ns INTEGER, - stddev_ns INTEGER, - avg_ts REAL, - stddev_ts REAL + avg_prompt_ns INTEGER, + stddev_prompt_ns INTEGER, + avg_prompt_ts REAL, + stddev_prompt_ts REAL, + avg_gen_ns INTEGER, + stddev_gen_ns INTEGER, + avg_gen_ts REAL, + stddev_gen_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, 
stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '32', '2025-04-18T11:22:37Z', '66', '58', '0.000000', '0.000000', '70741266', '2050337', '452.606173', '13.122321'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, 
model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '4096', '32', '2025-04-18T11:22:37Z', '270934866', '4466069', '15120.737903', '246.900896', '85258733', '2156168', '375.487736', '9.468350'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index cbcbfcee861ee..29c45b55b904b 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -189,9 +189,9 @@ struct cmd_params { static const cmd_params cmd_params_defaults = { /* model */ { "models/7B/ggml-model-q4_0.gguf" }, - /* n_prompt */ { 512 }, - /* n_gen */ { 128 }, - /* n_pg */ {}, + /* n_prompt */ { 0 }, + /* n_gen */ { 32 }, + /* n_pg */ { { 4096, 32 } }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -210,7 +210,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, + /* reps */ 3, /* prio */ GGML_SCHED_PRIO_NORMAL, /* delay */ 0, /* verbose */ false, @@ -901,7 +901,8 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector samples_ns; + std::vector samples_prompt_ns; // prompt processing latency + std::vector samples_gen_ns; // token generation latency test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : cpu_info(get_cpu_info()), @@ -939,21 +940,33 @@ 
struct test { (void) ctx; } - uint64_t avg_ns() const { return ::avg(samples_ns); } + uint64_t avg_prompt_ns() const { return ::avg(samples_prompt_ns); } + uint64_t avg_gen_ns() const { return ::avg(samples_gen_ns); } - uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t stddev_prompt_ns() const { return ::stdev(samples_prompt_ns); } + uint64_t stddev_gen_ns() const { return ::stdev(samples_gen_ns); } - std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + std::vector get_ts(const std::vector & samples_ns, int n_tokens) const { + if(n_tokens==0) + return {0}; std::vector ts; std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } + + std::vector get_prompt_ts() const { + return get_ts(samples_prompt_ns, n_prompt); + } + std::vector get_gen_ts() const { + return get_ts(samples_gen_ns, n_gen); + } - double avg_ts() const { return ::avg(get_ts()); } + double avg_prompt_ts() const { return ::avg(get_prompt_ts()); } + double avg_gen_ts() const { return ::avg(get_gen_ts()); } - double stdev_ts() const { return ::stdev(get_ts()); } + double stdev_prompt_ts() const { return ::stdev(get_prompt_ts()); } + double stdev_gen_ts() const { return ::stdev(get_gen_ts()); } static std::string get_backend() { std::vector backends; @@ -969,12 +982,13 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", 
"model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", + "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", + "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" }; return fields; } @@ -984,15 +998,15 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_prompt_ns" || field == "stddev_prompt_ns" || + field == "avg_gen_ns" || field == "stddev_gen_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { return FLOAT; } return STRING; @@ -1042,10 +1056,14 @@ struct test { std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), - std::to_string(stdev_ns()), - std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::to_string(avg_prompt_ns()), + std::to_string(stddev_prompt_ns()), + std::to_string(avg_prompt_ts()), + std::to_string(stdev_prompt_ts()), + std::to_string(avg_gen_ns()), + std::to_string(stddev_gen_ns()), + std::to_string(avg_gen_ts()), + std::to_string(stdev_gen_ts()) }; return values; } @@ -1153,8 +1171,10 @@ struct json_printer : public printer { } 
         fprintf(fout, "  {\n");
         print_fields(test::get_fields(), t.get_values());
-        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
-        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "    \"samples_prompt_ns\": [ %s ],\n", join(t.samples_prompt_ns, ", ").c_str());
+        fprintf(fout, "    \"samples_prompt_ts\": [ %s ],\n", join(t.get_prompt_ts(), ", ").c_str());
+        fprintf(fout, "    \"samples_gen_ns\": [ %s ],\n", join(t.samples_gen_ns, ", ").c_str());
+        fprintf(fout, "    \"samples_gen_ts\": [ %s ]\n", join(t.get_gen_ts(), ", ").c_str());
         fprintf(fout, "  }");
         fflush(fout);
     }
@@ -1173,8 +1193,10 @@ struct jsonl_printer : public printer {
     void print_test(const test & t) override {
         fprintf(fout, "{");
         print_fields(test::get_fields(), t.get_values());
-        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
-        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "\"samples_prompt_ns\": [ %s ],", join(t.samples_prompt_ns, ", ").c_str());
+        fprintf(fout, "\"samples_prompt_ts\": [ %s ],", join(t.get_prompt_ts(), ", ").c_str());
+        fprintf(fout, "\"samples_gen_ns\": [ %s ],", join(t.samples_gen_ns, ", ").c_str());
+        fprintf(fout, "\"samples_gen_ts\": [ %s ]", join(t.get_gen_ts(), ", ").c_str());
         fprintf(fout, "}\n");
         fflush(fout);
     }
@@ -1187,8 +1209,11 @@ struct markdown_printer : public printer {
         if (field == "model") {
             return -30;
         }
-        if (field == "t/s") {
-            return 20;
+        if (field == "prompt t/s") {
+            return 18;
+        }
+        if (field == "gen t/s") {
+            return 15;
         }
         if (field == "size" || field == "params") {
             return 10;
@@ -1260,7 +1285,6 @@ struct markdown_printer : public printer {
     void print_header(const cmd_params & params) override {
         // select fields to print
         fields.emplace_back("model");
-        fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
         bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
@@ -1314,7
+1338,8 @@ struct markdown_printer : public printer { fields.emplace_back("embeddings"); } fields.emplace_back("test"); - fields.emplace_back("t/s"); + fields.emplace_back("prompt t/s"); + fields.emplace_back("gen t/s"); fprintf(fout, "|"); for (const auto & field : fields) { @@ -1363,8 +1388,11 @@ struct markdown_printer : public printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } value = buf; - } else if (field == "t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); + } else if (field == "prompt t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_prompt_ts(), t.stdev_prompt_ts()); + value = buf; + } else if (field == "gen t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_gen_ts(), t.stdev_gen_ts()); value = buf; } else if (vmap.find(field) != vmap.end()) { value = vmap.at(field); @@ -1374,7 +1402,7 @@ struct markdown_printer : public printer { } int width = get_field_width(field); - if (field == "t/s") { + if (field == "prompt t/s" || field == "gen t/s") { // HACK: the utf-8 character is 2 bytes width += 1; } @@ -1629,6 +1657,9 @@ int main(int argc, char ** argv) { } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } + + uint64_t t_gen_start = get_time_ns(); + if (t.n_gen > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, @@ -1637,8 +1668,13 @@ int main(int argc, char ** argv) { test_gen(ctx, t.n_gen, t.n_threads); } - uint64_t t_ns = get_time_ns() - t_start; - t.samples_ns.push_back(t_ns); + uint64_t t_end = get_time_ns(); + + uint64_t prompt_ns = t_gen_start - t_start; + uint64_t gen_ns = t_end - t_gen_start; + + t.samples_prompt_ns.push_back(prompt_ns); + t.samples_gen_ns.push_back(gen_ns); } if (p) {