diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index bf7fd29c8c55f..ab3f9780ada57 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -5,12 +5,13 @@ Performance testing tool for llama.cpp. ## Table of contents 1. [Syntax](#syntax) -2. [Examples](#examples) +2. [Metrics](#metrics) +3. [Examples](#examples) 1. [Text generation with different models](#text-generation-with-different-models) 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) 3. [Different numbers of threads](#different-numbers-of-threads) 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) -3. [Output formats](#output-formats) +4. [Output formats](#output-formats) 1. [Markdown](#markdown) 2. [CSV](#csv) 3. [JSON](#json) @@ -79,6 +80,33 @@ Using the `-d ` option, each test can be run at a specified context depth, pr For a description of the other options, see the [main example](../main/README.md). +## Metrics + +### Time to First Token (TTFT) + +$$ T_{ttft} = t_{prompt} + t^{(1)}_{gen} $$ + +where +* $t_{prompt}$ : total prompt processing time +* $t^{(1)}_{gen}$ : token generation time for the first token + +> **_NOTE_**: This is only meaningful with a `-pg` test. 
+
+### End-to-End Request Latency (E2E)
+
+$$ T_{e2e} = t_{prompt} + t_{gen} $$
+
+where
+* $t_{prompt}$ : total prompt processing time
+* $t_{gen}$ : total token generation time
+
+### Inter-token Latency (ITL)
+
+$$ T_{itl} = \frac{T_{e2e} - T_{ttft}}{n_{gen} - 1} $$
+
+where
+* $n_{gen}$ : tokens to generate (`-n` flag)
+
 ## Examples
 
 ### Text generation with different models
 
@@ -190,9 +218,9 @@ $ ./llama-bench -o csv
 ```
 
 ```csv
-build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,tensor_buft_overrides,use_mmap,embeddings,no_op_offload,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts,avg_ttft_ms,stddev_ttft_ms,avg_e2e_ms,stddev_e2e_ms,avg_itl_ms,stddev_itl_ms
+"69f7e7116","6321","OpenBLAS, 
CPU","","BLAS","models/granite-3.3-2b-instruct-be.IQ4_XS.gguf","granite 3B IQ4_XS - 4.25 bpw","1395281392","2533539840","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","none","1","0","0","512","0","0","2025-08-28T16:57:00Z","6313008195","4119608","81.102409","0.052946","0.000000","0.000000","6313.008195","4.118842","0.000000","0.000000" +"69f7e7116","6321","OpenBLAS, CPU","","BLAS","models/granite-3.3-2b-instruct-be.IQ4_XS.gguf","granite 3B IQ4_XS - 4.25 bpw","1395281392","2533539840","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","none","1","0","0","0","128","0","2025-08-28T16:57:38Z","5510702782","66734031","23.230240","0.280041","42.929458","0.403130","5510.702782","66.734011","43.053333","0.522383" ``` ### JSON @@ -204,15 +232,15 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "8cf427ff", - "build_number": 5163, - "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", - "gpu_info": "NVIDIA GeForce RTX 4080", - "backends": "CUDA", - "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", - "model_type": "qwen2 7B Q4_K - Medium", - "model_size": 4677120000, - "model_n_params": 7615616512, + "build_commit": "69f7e7116", + "build_number": 6321, + "cpu_info": "OpenBLAS, CPU", + "gpu_info": "", + "backends": "BLAS", + "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", + "model_type": "granite 3B IQ4_XS - 4.25 bpw", + "model_size": 1395281392, + "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, @@ -227,29 +255,39 @@ $ ./llama-bench -o json "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, + "no_op_offload": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, - "test_time": "2025-04-24T11:58:50Z", - "avg_ns": 72135640, - "stddev_ns": 1453752, - "avg_ts": 7100.002165, - "stddev_ts": 140.341520, - "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], - 
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] + "test_time": "2025-08-28T17:01:34Z", + "avg_ns": 6276064173, + "stddev_ns": 34323113, + "avg_ts": 81.581735, + "stddev_ts": 0.444487, + "avg_ttft_ms": 0.000000, + "stddev_ttft_ms": 0.000000, + "avg_e2e_ms": 6276.064174, + "stddev_e2e_ms": 34.322931, + "avg_itl_ms": 0.000000, + "stddev_itl_ms": 0.000000, + "samples_ns": [ 6255489794, 6293138165, 6328736359, 6254136857, 6248819694 ], + "samples_ts": [ 81.8481, 81.3585, 80.9008, 81.8658, 81.9355 ], + "samples_ttft_ns": [ ], + "samples_itl_ns": [ ] }, { - "build_commit": "8cf427ff", - "build_number": 5163, - "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", - "gpu_info": "NVIDIA GeForce RTX 4080", - "backends": "CUDA", - "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", - "model_type": "qwen2 7B Q4_K - Medium", - "model_size": 4677120000, - "model_n_params": 7615616512, + "build_commit": "69f7e7116", + "build_number": 6321, + "cpu_info": "OpenBLAS, CPU", + "gpu_info": "", + "backends": "BLAS", + "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", + "model_type": "granite 3B IQ4_XS - 4.25 bpw", + "model_size": 1395281392, + "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, @@ -264,18 +302,28 @@ $ ./llama-bench -o json "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, + "no_op_offload": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, - "test_time": "2025-04-24T11:58:51Z", - "avg_ns": 1076767880, - "stddev_ns": 9449585, - "avg_ts": 118.881588, - "stddev_ts": 1.041811, - "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], - "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] + "test_time": "2025-08-28T17:02:12Z", + "avg_ns": 5613693967, + "stddev_ns": 9159226, + "avg_ts": 22.801434, + "stddev_ts": 0.037157, + "avg_ttft_ms": 43.766002, + "stddev_ttft_ms": 0.161696, + "avg_e2e_ms": 
5613.693967, + "stddev_e2e_ms": 9.158920, + "avg_itl_ms": 43.857701, + "stddev_itl_ms": 0.073127, + "samples_ns": [ 5617273869, 5609876644, 5628089934, 5607197607, 5606031783 ], + "samples_ts": [ 22.7869, 22.8169, 22.7431, 22.8278, 22.8325 ], + "samples_ttft_ns": [ 43584301, 43738022, 43642511, 43936642, 43928534 ], + "samples_itl_ns": [ 4.38873e+07, 4.38279e+07, 4.3972e+07, 4.38052e+07, 4.37961e+07 ] } ] ``` @@ -288,8 +336,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, 
"embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +{"build_commit": "69f7e7116", "build_number": 6321, "cpu_info": "OpenBLAS, CPU", "gpu_info": "", "backends": "BLAS", "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", "model_type": "granite 3B IQ4_XS - 4.25 bpw", "model_size": 1395281392, "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, "no_op_offload": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-08-28T17:05:56Z", "avg_ns": 6287855687, "stddev_ns": 24678326, "avg_ts": 81.427806, "stddev_ts": 0.318701, "avg_ttft_ms": 0.000000, "stddev_ttft_ms": 0.000000, "avg_e2e_ms": 6287.855687, "stddev_e2e_ms": 24.678263, "avg_itl_ms": 0.000000, "stddev_itl_ms": 0.000000, "samples_ns": [ 6278179171, 6280580749, 6291595583, 6327539008, 6261383925 ],"samples_ts": [ 81.5523, 81.5211, 81.3784, 80.9161, 81.7711 ],"samples_ttft_ns": [ ],"samples_itl_ns": [ ]} +{"build_commit": "69f7e7116", "build_number": 6321, "cpu_info": "OpenBLAS, CPU", "gpu_info": "", "backends": "BLAS", "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", "model_type": "granite 3B IQ4_XS - 4.25 bpw", "model_size": 1395281392, "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, 
"flash_attn": false, "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, "no_op_offload": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-08-28T17:06:33Z", "avg_ns": 5498943758, "stddev_ns": 137464807, "avg_ts": 23.288635, "stddev_ts": 0.571937, "avg_ttft_ms": 43.519547, "stddev_ttft_ms": 1.625999, "avg_e2e_ms": 5498.943758, "stddev_e2e_ms": 137.464807, "avg_itl_ms": 42.956096, "stddev_itl_ms": 1.075456, "samples_ns": [ 5563859512, 5712210875, 5398301124, 5410462401, 5409884878 ],"samples_ts": [ 23.0056, 22.4081, 23.7112, 23.6579, 23.6604 ],"samples_ttft_ns": [ 46182840, 43768889, 43176480, 42210650, 42258878 ],"samples_itl_ns": [ 4.34463e+07, 4.46334e+07, 4.21663e+07, 4.22697e+07, 4.22648e+07 ]} ``` @@ -302,7 +350,7 @@ $ ./llama-bench -o sql ``` ```sql -CREATE TABLE IF NOT EXISTS test ( +CREATE TABLE IF NOT EXISTS llama_bench ( build_commit TEXT, build_number INTEGER, cpu_info TEXT, @@ -326,8 +374,10 @@ CREATE TABLE IF NOT EXISTS test ( no_kv_offload INTEGER, flash_attn INTEGER, tensor_split TEXT, + tensor_buft_overrides TEXT, use_mmap INTEGER, embeddings INTEGER, + no_op_offload INTEGER, n_prompt INTEGER, n_gen INTEGER, n_depth INTEGER, @@ -335,9 +385,15 @@ CREATE TABLE IF NOT EXISTS test ( avg_ns INTEGER, stddev_ns INTEGER, avg_ts REAL, - stddev_ts REAL + stddev_ts REAL, + avg_ttft_ms REAL, + stddev_ttft_ms REAL, + avg_e2e_ms REAL, + stddev_e2e_ms REAL, + avg_itl_ms REAL, + stddev_itl_ms REAL ); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 
'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, tensor_buft_overrides, use_mmap, embeddings, no_op_offload, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts, avg_ttft_ms, stddev_ttft_ms, avg_e2e_ms, stddev_e2e_ms, avg_itl_ms, stddev_itl_ms) VALUES ('69f7e7116', '6321', 'OpenBLAS, CPU', '', 'BLAS', 'models/granite-3.3-2b-instruct-be.IQ4_XS.gguf', 'granite 3B IQ4_XS - 4.25 bpw', '1395281392', '2533539840', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', 'none', '1', '0', '0', '512', '0', '0', '2025-08-28T17:08:13Z', '6326649352', '9345621', '80.927655', '0.119504', '0.000000', '0.000000', '6326.649353', '9.344944', 
'0.000000', '0.000000'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, tensor_buft_overrides, use_mmap, embeddings, no_op_offload, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts, avg_ttft_ms, stddev_ttft_ms, avg_e2e_ms, stddev_e2e_ms, avg_itl_ms, stddev_itl_ms) VALUES ('69f7e7116', '6321', 'OpenBLAS, CPU', '', 'BLAS', 'models/granite-3.3-2b-instruct-be.IQ4_XS.gguf', 'granite 3B IQ4_XS - 4.25 bpw', '1395281392', '2533539840', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', 'none', '1', '0', '0', '0', '128', '0', '2025-08-28T17:08:51Z', '5640474378', '40328612', '22.694063', '0.163702', '44.155657', '0.086653', '5640.474378', '40.328543', '44.065502', '0.317725'); ``` diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 9378706a12a7c..b683265b0e1c6 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -1161,6 +1161,7 @@ struct test { int n_depth; std::string test_time; std::vector samples_ns; + std::vector samples_ttft_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : cpu_info(get_cpu_info()), @@ -1205,6 +1206,10 @@ struct test { uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t avg_ttft_ns() const { return ::avg(samples_ttft_ns); } + + uint64_t stdev_ttft_ns() const { return ::stdev(samples_ttft_ns); } + std::vector get_ts() const { int n_tokens = n_prompt + n_gen; std::vector ts; @@ -1213,10 +1218,57 @@ struct test { return ts; } + std::vector get_ttft_ms() const { + std::vector ttft_ms; + std::transform(samples_ttft_ns.begin(), samples_ttft_ns.end(), std::back_inserter(ttft_ms), + [](uint64_t t) { return t / 1e6; }); 
+ return ttft_ms; + } + + std::vector get_e2e_ms() const { + std::vector e2e_ms; + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(e2e_ms), + [](uint64_t t) { return t / 1e6; }); + return e2e_ms; + } + + std::vector get_itl_ns() const { + std::vector itl_ns; + if (n_gen == 0) return itl_ns; + + for (size_t i = 0; i < samples_ns.size(); i++) { + double e2e_ns = samples_ns[i]; + double ttft_ns = samples_ttft_ns[i]; + double itl = (e2e_ns - ttft_ns) / (n_gen - 1); + itl_ns.push_back(itl); + } + return itl_ns; + } + + std::vector get_itl_ms() const { + std::vector itl_ns = get_itl_ns(); + std::vector itl_ms; + std::transform(itl_ns.begin(), itl_ns.end(), std::back_inserter(itl_ms), + [](double t) { return t / 1e6; }); + return itl_ms; + } + double avg_ts() const { return ::avg(get_ts()); } double stdev_ts() const { return ::stdev(get_ts()); } + double avg_ttft_ms() const { return ::avg(get_ttft_ms()); } + + double stdev_ttft_ms() const { return ::stdev(get_ttft_ms()); } + + double avg_e2e_ms() const { return ::avg(get_e2e_ms()); } + + double stdev_e2e_ms() const { return ::stdev(get_e2e_ms()); } + + double avg_itl_ms() const { return ::avg(get_itl_ms()); } + + double stdev_itl_ms() const { return ::stdev(get_itl_ms()); } + static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { @@ -1231,12 +1283,13 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", 
"build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "avg_ttft_ms", "stddev_ttft_ms", + "avg_e2e_ms", "stddev_e2e_ms", "avg_itl_ms", "stddev_itl_ms", }; return fields; } @@ -1244,17 +1297,18 @@ struct test { enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || - field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_ts" || field == "stddev_ts" || field == "avg_ttft_ms" || field == "stddev_ttft_ms" || + field == "avg_e2e_ms" || field == "stddev_e2e_ms" || field == "avg_itl_ms" || field == 
"stddev_itl_ms") { return FLOAT; } return STRING; @@ -1331,7 +1385,13 @@ struct test { std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::to_string(stdev_ts()), + std::to_string(avg_ttft_ms()), + std::to_string(stdev_ttft_ms()), + std::to_string(avg_e2e_ms()), + std::to_string(stdev_e2e_ms()), + std::to_string(avg_itl_ms()), + std::to_string(stdev_itl_ms()) }; return values; } @@ -1440,7 +1500,9 @@ struct json_printer : public printer { fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); - fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_ts\": [ %s ],\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_ttft_ns\": [ %s ],\n", join(t.samples_ttft_ns, ", ").c_str()); + fprintf(fout, " \"samples_itl_ns\": [ %s ]\n", join(t.get_itl_ns(), ", ").c_str()); fprintf(fout, " }"); fflush(fout); } @@ -1460,7 +1522,9 @@ struct jsonl_printer : public printer { fprintf(fout, "{"); print_fields(test::get_fields(), t.get_values()); fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); - fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_ts\": [ %s ],", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_ttft_ns\": [ %s ],", join(t.samples_ttft_ns, ", ").c_str()); + fprintf(fout, "\"samples_itl_ns\": [ %s ]", join(t.get_itl_ns(), ", ").c_str()); fprintf(fout, "}\n"); fflush(fout); } @@ -1761,7 +1825,7 @@ static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th return true; } -static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { +static bool test_gen(llama_context * ctx, int n_gen, int n_threads, uint64_t * t_ttft) { llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); @@ -1778,6 +1842,12 @@ 
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { } llama_synchronize(ctx); token = std::rand() % n_vocab; + + // capture the time to first token + // t_ttft may be a nullptr from the warmup run + if (i == 0 && t_ttft != nullptr) { + *t_ttft = get_time_ns(); + } } return true; } @@ -1935,7 +2005,7 @@ int main(int argc, char ** argv) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); } - bool res = test_gen(ctx, 1, t.n_threads); + bool res = test_gen(ctx, 1, t.n_threads, nullptr); if (!res) { fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__); exit(1); @@ -1959,6 +2029,7 @@ int main(int argc, char ** argv) { } uint64_t t_start = get_time_ns(); + uint64_t t_ttft = 0; if (t.n_prompt > 0) { if (params.progress) { @@ -1976,11 +2047,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - bool res = test_gen(ctx, t.n_gen, t.n_threads); + bool res = test_gen(ctx, t.n_gen, t.n_threads, &t_ttft); if (!res) { fprintf(stderr, "%s: error: failed to run gen\n", __func__); exit(1); } + t.samples_ttft_ns.push_back(t_ttft - t_start); } uint64_t t_ns = get_time_ns() - t_start;