Add benchmark metrics to llama-cli.

zoq · zoq · commit aa3b5a4e8184 · 2025-10-21T14:41:25.000-04:00
Signed-off-by: Marcus Edel <marcus.edel@collabora.com>
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1663,6 +1663,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_print = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--bench"}, "N",
+        string_format("repeat the full generation N times and report aggregate throughput (default: %d)", params.bench_runs),
+        [](common_params & params, int value) {
+            params.bench_runs = std::max(1, value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--prompt-cache"}, "FNAME",
         "file to cache prompt state for faster startup (default: none)",
diff --git a/common/common.h b/common/common.h
@@ -284,6 +284,7 @@ struct common_params {
     int32_t grp_attn_n            =     1; // group-attention factor
     int32_t grp_attn_w            =   512; // group-attention width
     int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
+    int32_t bench_runs            =     1; // repeat full generation runs for benchmarking
     float   rope_freq_base        =  0.0f; // RoPE base frequency
     float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
     float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
diff --git a/tools/main/main.cpp b/tools/main/main.cpp