diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index bf7fd29c8c55f..ab3f9780ada57 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -5,12 +5,13 @@ Performance testing tool for llama.cpp. ## Table of contents 1. [Syntax](#syntax) -2. [Examples](#examples) +2. [Metrics](#metrics) +3. [Examples](#examples) 1. [Text generation with different models](#text-generation-with-different-models) 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) 3. [Different numbers of threads](#different-numbers-of-threads) 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) -3. [Output formats](#output-formats) +4. [Output formats](#output-formats) 1. [Markdown](#markdown) 2. [CSV](#csv) 3. [JSON](#json) @@ -79,6 +80,33 @@ Using the `-d ` option, each test can be run at a specified context depth, pr For a description of the other options, see the [main example](../main/README.md). +## Metrics + +### Time to First Token (TTFT) + +$$ T_{ttft} = t_{prompt} + t^{(1)}_{gen} $$ + +where +* $t_{prompt}$ : total prompt processing time +* $t^{(1)}_{gen}$ : token generation time for the first token + +> **_NOTE_**: This is only meaningful with a `-pg` test. 
+
+### End-to-End Request Latency (E2E)
+
+$$ T_{e2e} = t_{prompt} + t_{gen} $$
+
+where
+* $t_{prompt}$ : total prompt processing time
+* $t_{gen}$ : total token generation time
+
+### Inter-token Latency (ITL)
+
+$$ T_{itl} = \frac{T_{e2e} - T_{ttft}}{n_{gen} - 1} $$
+
+where
+* $n_{gen}$ : tokens to generate (`-n` flag)
+
 ## Examples
 
 ### Text generation with different models
 
@@ -190,9 +218,9 @@ $ ./llama-bench -o csv
 ```
 
 ```csv
-build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,tensor_buft_overrides,use_mmap,embeddings,no_op_offload,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts,avg_ttft_ms,stddev_ttft_ms,avg_e2e_ms,stddev_e2e_ms,avg_itl_ms,stddev_itl_ms
+"69f7e7116","6321","OpenBLAS, 
CPU","","BLAS","models/granite-3.3-2b-instruct-be.IQ4_XS.gguf","granite 3B IQ4_XS - 4.25 bpw","1395281392","2533539840","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","none","1","0","0","512","0","0","2025-08-28T16:57:00Z","6313008195","4119608","81.102409","0.052946","0.000000","0.000000","6313.008195","4.118842","0.000000","0.000000" +"69f7e7116","6321","OpenBLAS, CPU","","BLAS","models/granite-3.3-2b-instruct-be.IQ4_XS.gguf","granite 3B IQ4_XS - 4.25 bpw","1395281392","2533539840","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","none","1","0","0","0","128","0","2025-08-28T16:57:38Z","5510702782","66734031","23.230240","0.280041","42.929458","0.403130","5510.702782","66.734011","43.053333","0.522383" ``` ### JSON @@ -204,15 +232,15 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "8cf427ff", - "build_number": 5163, - "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", - "gpu_info": "NVIDIA GeForce RTX 4080", - "backends": "CUDA", - "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", - "model_type": "qwen2 7B Q4_K - Medium", - "model_size": 4677120000, - "model_n_params": 7615616512, + "build_commit": "69f7e7116", + "build_number": 6321, + "cpu_info": "OpenBLAS, CPU", + "gpu_info": "", + "backends": "BLAS", + "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", + "model_type": "granite 3B IQ4_XS - 4.25 bpw", + "model_size": 1395281392, + "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, @@ -227,29 +255,39 @@ $ ./llama-bench -o json "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, + "no_op_offload": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, - "test_time": "2025-04-24T11:58:50Z", - "avg_ns": 72135640, - "stddev_ns": 1453752, - "avg_ts": 7100.002165, - "stddev_ts": 140.341520, - "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], - 
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] + "test_time": "2025-08-28T17:01:34Z", + "avg_ns": 6276064173, + "stddev_ns": 34323113, + "avg_ts": 81.581735, + "stddev_ts": 0.444487, + "avg_ttft_ms": 0.000000, + "stddev_ttft_ms": 0.000000, + "avg_e2e_ms": 6276.064174, + "stddev_e2e_ms": 34.322931, + "avg_itl_ms": 0.000000, + "stddev_itl_ms": 0.000000, + "samples_ns": [ 6255489794, 6293138165, 6328736359, 6254136857, 6248819694 ], + "samples_ts": [ 81.8481, 81.3585, 80.9008, 81.8658, 81.9355 ], + "samples_ttft_ns": [ ], + "samples_itl_ns": [ ] }, { - "build_commit": "8cf427ff", - "build_number": 5163, - "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", - "gpu_info": "NVIDIA GeForce RTX 4080", - "backends": "CUDA", - "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", - "model_type": "qwen2 7B Q4_K - Medium", - "model_size": 4677120000, - "model_n_params": 7615616512, + "build_commit": "69f7e7116", + "build_number": 6321, + "cpu_info": "OpenBLAS, CPU", + "gpu_info": "", + "backends": "BLAS", + "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", + "model_type": "granite 3B IQ4_XS - 4.25 bpw", + "model_size": 1395281392, + "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, @@ -264,18 +302,28 @@ $ ./llama-bench -o json "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, + "no_op_offload": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, - "test_time": "2025-04-24T11:58:51Z", - "avg_ns": 1076767880, - "stddev_ns": 9449585, - "avg_ts": 118.881588, - "stddev_ts": 1.041811, - "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], - "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] + "test_time": "2025-08-28T17:02:12Z", + "avg_ns": 5613693967, + "stddev_ns": 9159226, + "avg_ts": 22.801434, + "stddev_ts": 0.037157, + "avg_ttft_ms": 43.766002, + "stddev_ttft_ms": 0.161696, + "avg_e2e_ms": 
5613.693967, + "stddev_e2e_ms": 9.158920, + "avg_itl_ms": 43.857701, + "stddev_itl_ms": 0.073127, + "samples_ns": [ 5617273869, 5609876644, 5628089934, 5607197607, 5606031783 ], + "samples_ts": [ 22.7869, 22.8169, 22.7431, 22.8278, 22.8325 ], + "samples_ttft_ns": [ 43584301, 43738022, 43642511, 43936642, 43928534 ], + "samples_itl_ns": [ 4.38873e+07, 4.38279e+07, 4.3972e+07, 4.38052e+07, 4.37961e+07 ] } ] ``` @@ -288,8 +336,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, 
"embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +{"build_commit": "69f7e7116", "build_number": 6321, "cpu_info": "OpenBLAS, CPU", "gpu_info": "", "backends": "BLAS", "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", "model_type": "granite 3B IQ4_XS - 4.25 bpw", "model_size": 1395281392, "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, "no_op_offload": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-08-28T17:05:56Z", "avg_ns": 6287855687, "stddev_ns": 24678326, "avg_ts": 81.427806, "stddev_ts": 0.318701, "avg_ttft_ms": 0.000000, "stddev_ttft_ms": 0.000000, "avg_e2e_ms": 6287.855687, "stddev_e2e_ms": 24.678263, "avg_itl_ms": 0.000000, "stddev_itl_ms": 0.000000, "samples_ns": [ 6278179171, 6280580749, 6291595583, 6327539008, 6261383925 ],"samples_ts": [ 81.5523, 81.5211, 81.3784, 80.9161, 81.7711 ],"samples_ttft_ns": [ ],"samples_itl_ns": [ ]} +{"build_commit": "69f7e7116", "build_number": 6321, "cpu_info": "OpenBLAS, CPU", "gpu_info": "", "backends": "BLAS", "model_filename": "models/granite-3.3-2b-instruct-be.IQ4_XS.gguf", "model_type": "granite 3B IQ4_XS - 4.25 bpw", "model_size": 1395281392, "model_n_params": 2533539840, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, 
"flash_attn": false, "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "embeddings": false, "no_op_offload": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-08-28T17:06:33Z", "avg_ns": 5498943758, "stddev_ns": 137464807, "avg_ts": 23.288635, "stddev_ts": 0.571937, "avg_ttft_ms": 43.519547, "stddev_ttft_ms": 1.625999, "avg_e2e_ms": 5498.943758, "stddev_e2e_ms": 137.464807, "avg_itl_ms": 42.956096, "stddev_itl_ms": 1.075456, "samples_ns": [ 5563859512, 5712210875, 5398301124, 5410462401, 5409884878 ],"samples_ts": [ 23.0056, 22.4081, 23.7112, 23.6579, 23.6604 ],"samples_ttft_ns": [ 46182840, 43768889, 43176480, 42210650, 42258878 ],"samples_itl_ns": [ 4.34463e+07, 4.46334e+07, 4.21663e+07, 4.22697e+07, 4.22648e+07 ]} ``` @@ -302,7 +350,7 @@ $ ./llama-bench -o sql ``` ```sql -CREATE TABLE IF NOT EXISTS test ( +CREATE TABLE IF NOT EXISTS llama_bench ( build_commit TEXT, build_number INTEGER, cpu_info TEXT, @@ -326,8 +374,10 @@ CREATE TABLE IF NOT EXISTS test ( no_kv_offload INTEGER, flash_attn INTEGER, tensor_split TEXT, + tensor_buft_overrides TEXT, use_mmap INTEGER, embeddings INTEGER, + no_op_offload INTEGER, n_prompt INTEGER, n_gen INTEGER, n_depth INTEGER, @@ -335,9 +385,15 @@ CREATE TABLE IF NOT EXISTS test ( avg_ns INTEGER, stddev_ns INTEGER, avg_ts REAL, - stddev_ts REAL + stddev_ts REAL, + avg_ttft_ms REAL, + stddev_ttft_ms REAL, + avg_e2e_ms REAL, + stddev_e2e_ms REAL, + avg_itl_ms REAL, + stddev_itl_ms REAL ); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 
'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, tensor_buft_overrides, use_mmap, embeddings, no_op_offload, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts, avg_ttft_ms, stddev_ttft_ms, avg_e2e_ms, stddev_e2e_ms, avg_itl_ms, stddev_itl_ms) VALUES ('69f7e7116', '6321', 'OpenBLAS, CPU', '', 'BLAS', 'models/granite-3.3-2b-instruct-be.IQ4_XS.gguf', 'granite 3B IQ4_XS - 4.25 bpw', '1395281392', '2533539840', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', 'none', '1', '0', '0', '512', '0', '0', '2025-08-28T17:08:13Z', '6326649352', '9345621', '80.927655', '0.119504', '0.000000', '0.000000', '6326.649353', '9.344944', 
'0.000000', '0.000000'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, tensor_buft_overrides, use_mmap, embeddings, no_op_offload, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts, avg_ttft_ms, stddev_ttft_ms, avg_e2e_ms, stddev_e2e_ms, avg_itl_ms, stddev_itl_ms) VALUES ('69f7e7116', '6321', 'OpenBLAS, CPU', '', 'BLAS', 'models/granite-3.3-2b-instruct-be.IQ4_XS.gguf', 'granite 3B IQ4_XS - 4.25 bpw', '1395281392', '2533539840', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', 'none', '1', '0', '0', '0', '128', '0', '2025-08-28T17:08:51Z', '5640474378', '40328612', '22.694063', '0.163702', '44.155657', '0.086653', '5640.474378', '40.328543', '44.065502', '0.317725'); ``` diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 9378706a12a7c..b683265b0e1c6 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -1161,6 +1161,7 @@ struct test { int n_depth; std::string test_time; std::vector samples_ns; + std::vector samples_ttft_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : cpu_info(get_cpu_info()), @@ -1205,6 +1206,10 @@ struct test { uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t avg_ttft_ns() const { return ::avg(samples_ttft_ns); } + + uint64_t stdev_ttft_ns() const { return ::stdev(samples_ttft_ns); } + std::vector get_ts() const { int n_tokens = n_prompt + n_gen; std::vector ts; @@ -1213,10 +1218,57 @@ struct test { return ts; } + std::vector get_ttft_ms() const { + std::vector ttft_ms; + std::transform(samples_ttft_ns.begin(), samples_ttft_ns.end(), std::back_inserter(ttft_ms), + [](uint64_t t) { return t / 1e6; }); 
+ return ttft_ms; + } + + std::vector get_e2e_ms() const { + std::vector e2e_ms; + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(e2e_ms), + [](uint64_t t) { return t / 1e6; }); + return e2e_ms; + } + + std::vector get_itl_ns() const { + std::vector itl_ns; + if (n_gen == 0) return itl_ns; + + for (size_t i = 0; i < samples_ns.size(); i++) { + double e2e_ns = samples_ns[i]; + double ttft_ns = samples_ttft_ns[i]; + double itl = (e2e_ns - ttft_ns) / (n_gen - 1); + itl_ns.push_back(itl); + } + return itl_ns; + } + + std::vector get_itl_ms() const { + std::vector itl_ns = get_itl_ns(); + std::vector itl_ms; + std::transform(itl_ns.begin(), itl_ns.end(), std::back_inserter(itl_ms), + [](double t) { return t / 1e6; }); + return itl_ms; + } + double avg_ts() const { return ::avg(get_ts()); } double stdev_ts() const { return ::stdev(get_ts()); } + double avg_ttft_ms() const { return ::avg(get_ttft_ms()); } + + double stdev_ttft_ms() const { return ::stdev(get_ttft_ms()); } + + double avg_e2e_ms() const { return ::avg(get_e2e_ms()); } + + double stdev_e2e_ms() const { return ::stdev(get_e2e_ms()); } + + double avg_itl_ms() const { return ::avg(get_itl_ms()); } + + double stdev_itl_ms() const { return ::stdev(get_itl_ms()); } + static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { @@ -1231,12 +1283,13 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", 
"build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "avg_ttft_ms", "stddev_ttft_ms", + "avg_e2e_ms", "stddev_e2e_ms", "avg_itl_ms", "stddev_itl_ms", }; return fields; } @@ -1244,17 +1297,18 @@ struct test { enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || - field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_ts" || field == "stddev_ts" || field == "avg_ttft_ms" || field == "stddev_ttft_ms" || + field == "avg_e2e_ms" || field == "stddev_e2e_ms" || field == "avg_itl_ms" || field == 
"stddev_itl_ms") { return FLOAT; } return STRING; @@ -1331,7 +1385,13 @@ struct test { std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::to_string(stdev_ts()), + std::to_string(avg_ttft_ms()), + std::to_string(stdev_ttft_ms()), + std::to_string(avg_e2e_ms()), + std::to_string(stdev_e2e_ms()), + std::to_string(avg_itl_ms()), + std::to_string(stdev_itl_ms()) }; return values; } @@ -1440,7 +1500,9 @@ struct json_printer : public printer { fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); - fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_ts\": [ %s ],\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_ttft_ns\": [ %s ],\n", join(t.samples_ttft_ns, ", ").c_str()); + fprintf(fout, " \"samples_itl_ns\": [ %s ]\n", join(t.get_itl_ns(), ", ").c_str()); fprintf(fout, " }"); fflush(fout); } @@ -1460,7 +1522,9 @@ struct jsonl_printer : public printer { fprintf(fout, "{"); print_fields(test::get_fields(), t.get_values()); fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); - fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_ts\": [ %s ],", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_ttft_ns\": [ %s ],", join(t.samples_ttft_ns, ", ").c_str()); + fprintf(fout, "\"samples_itl_ns\": [ %s ]", join(t.get_itl_ns(), ", ").c_str()); fprintf(fout, "}\n"); fflush(fout); } @@ -1761,7 +1825,7 @@ static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th return true; } -static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { +static bool test_gen(llama_context * ctx, int n_gen, int n_threads, uint64_t * t_ttft) { llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); @@ -1778,6 +1842,12 @@ 
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { } llama_synchronize(ctx); token = std::rand() % n_vocab; + + // capture the time to first token + // t_ttft may be a nullptr from the warmup run + if (i == 0 && t_ttft != nullptr) { + *t_ttft = get_time_ns(); + } } return true; } @@ -1935,7 +2005,7 @@ int main(int argc, char ** argv) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); } - bool res = test_gen(ctx, 1, t.n_threads); + bool res = test_gen(ctx, 1, t.n_threads, nullptr); if (!res) { fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__); exit(1); @@ -1959,6 +2029,7 @@ int main(int argc, char ** argv) { } uint64_t t_start = get_time_ns(); + uint64_t t_ttft = 0; if (t.n_prompt > 0) { if (params.progress) { @@ -1976,11 +2047,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - bool res = test_gen(ctx, t.n_gen, t.n_threads); + bool res = test_gen(ctx, t.n_gen, t.n_threads, &t_ttft); if (!res) { fprintf(stderr, "%s: error: failed to run gen\n", __func__); exit(1); } + t.samples_ttft_ns.push_back(t_ttft - t_start); } uint64_t t_ns = get_time_ns() - t_start;