Skip to content

Commit aa3b5a4

Browse files
committed
Add benchmark metrics to the llama.cli.
Signed-off-by: Marcus Edel <[email protected]>
1 parent 5545231 commit aa3b5a4

File tree

3 files changed

+312
-7
lines changed

3 files changed

+312
-7
lines changed

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1663,6 +1663,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1663 1663
params.n_print = value;
1664 1664
}
1665 1665
).set_examples({LLAMA_EXAMPLE_MAIN}));
1666+
add_opt(common_arg(
1667+
{"--bench"}, "N",
1668+
string_format("repeat the full generation N times and report aggregate throughput (default: %d)", params.bench_runs),
1669+
[](common_params & params, int value) {
1670+
params.bench_runs = std::max(1, value);
1671+
}
1672+
).set_examples({LLAMA_EXAMPLE_MAIN}));
1666 1673
add_opt(common_arg(
1667 1674
{"--prompt-cache"}, "FNAME",
1668 1675
"file to cache prompt state for faster startup (default: none)",

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -284,6 +284,7 @@ struct common_params {
284 284
int32_t grp_attn_n = 1; // group-attention factor
285 285
int32_t grp_attn_w = 512; // group-attention width
286 286
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
287+
int32_t bench_runs = 1; // repeat full generation runs for benchmarking
287 288
float rope_freq_base = 0.0f; // RoPE base frequency
288 289
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
289 290
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor

0 commit comments

Comments
 (0)