Skip to content

Commit 044d499

Browse files
committed
Llama-bench: allow benchmarking lora impact
1 parent 96f4053 commit 044d499

File tree

1 file changed

+36
-7
lines changed

1 file changed

+36
-7
lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 36 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ static std::string pair_str(const std::pair<int, int> & p) {
157157

158158
struct cmd_params {
159159
std::vector<std::string> model;
160+
std::vector<std::string> lora;
160161
std::vector<int> n_prompt;
161162
std::vector<int> n_gen;
162163
std::vector<std::pair<int, int>> n_pg;
@@ -189,6 +190,7 @@ struct cmd_params {
189190

190191
static const cmd_params cmd_params_defaults = {
191192
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
193+
/* lora */ { "none" },
192194
/* n_prompt */ { 512 },
193195
/* n_gen */ { 128 },
194196
/* n_pg */ {},
@@ -225,6 +227,7 @@ static void print_usage(int /* argc */, char ** argv) {
225227
printf("options:\n");
226228
printf(" -h, --help\n");
227229
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
230+
printf(" --lora <filename> (default: %s)\n", join(cmd_params_defaults.lora, ",").c_str());
228231
printf(" -p, --n-prompt <n> (default: %s)\n",
229232
join(cmd_params_defaults.n_prompt, ",").c_str());
230233
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
@@ -341,6 +344,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
341344
}
342345
auto p = string_split<std::string>(argv[i], split_delim);
343346
params.model.insert(params.model.end(), p.begin(), p.end());
347+
} else if (arg == "--lora") {
348+
if (++i >= argc) {
349+
invalid_param = true;
350+
break;
351+
}
352+
auto p = string_split<std::string>(argv[i], split_delim);
353+
params.lora.insert(params.lora.end(), p.begin(), p.end());
344354
} else if (arg == "-p" || arg == "--n-prompt") {
345355
if (++i >= argc) {
346356
invalid_param = true;
@@ -606,6 +616,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
606616
if (params.model.empty()) {
607617
params.model = cmd_params_defaults.model;
608618
}
619+
if (params.lora.empty()) {
620+
params.lora = cmd_params_defaults.lora;
621+
}
609622
if (params.n_prompt.empty()) {
610623
params.n_prompt = cmd_params_defaults.n_prompt;
611624
}
@@ -672,6 +685,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
672685

673686
struct cmd_params_instance {
674687
std::string model;
688+
std::string lora;
675689
int n_prompt;
676690
int n_gen;
677691
int n_batch;
@@ -737,7 +751,7 @@ struct cmd_params_instance {
737751
}
738752

739753
bool equal_mparams(const cmd_params_instance & other) const {
740-
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
754+
return model == other.model && lora == other.lora && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
741755
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
742756
tensor_split == other.tensor_split;
743757
}
@@ -764,6 +778,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
764778
// this ordering minimizes the number of times that each model needs to be reloaded
765779
// clang-format off
766780
for (const auto & m : params.model)
781+
for (const auto & l : params.lora)
767782
for (const auto & nl : params.n_gpu_layers)
768783
for (const auto & rpc : params.rpc_servers)
769784
for (const auto & sm : params.split_mode)
@@ -787,6 +802,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
787802
}
788803
cmd_params_instance instance = {
789804
/* .model = */ m,
805+
/* .lora = */ l,
790806
/* .n_prompt = */ n_prompt,
791807
/* .n_gen = */ 0,
792808
/* .n_batch = */ nb,
@@ -816,6 +832,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
816832
}
817833
cmd_params_instance instance = {
818834
/* .model = */ m,
835+
/* .lora = */ l,
819836
/* .n_prompt = */ 0,
820837
/* .n_gen = */ n_gen,
821838
/* .n_batch = */ nb,
@@ -845,6 +862,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
845862
}
846863
cmd_params_instance instance = {
847864
/* .model = */ m,
865+
/* .lora = */ l,
848866
/* .n_prompt = */ n_pg.first,
849867
/* .n_gen = */ n_pg.second,
850868
/* .n_batch = */ nb,
@@ -879,6 +897,7 @@ struct test {
879897
static const std::string cpu_info;
880898
static const std::string gpu_info;
881899
std::string model_filename;
900+
std::string lora_filename;
882901
std::string model_type;
883902
uint64_t model_size;
884903
uint64_t model_n_params;
@@ -905,6 +924,7 @@ struct test {
905924

906925
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
907926
model_filename = inst.model;
927+
lora_filename = inst.lora;
908928
char buf[128];
909929
llama_model_desc(lmodel, buf, sizeof(buf));
910930
model_type = buf;
@@ -966,12 +986,12 @@ struct test {
966986

967987
static const std::vector<std::string> & get_fields() {
968988
static const std::vector<std::string> fields = {
969-
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
970-
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
971-
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
972-
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
973-
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
974-
"avg_ts", "stddev_ts",
989+
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
990+
"lora_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch",
991+
"n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v",
992+
"n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split",
993+
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
994+
"stddev_ns", "avg_ts", "stddev_ts",
975995
};
976996
return fields;
977997
}
@@ -1017,6 +1037,7 @@ struct test {
10171037
gpu_info,
10181038
get_backend(),
10191039
model_filename,
1040+
lora_filename,
10201041
model_type,
10211042
std::to_string(model_size),
10221043
std::to_string(model_n_params),
@@ -1259,6 +1280,9 @@ struct markdown_printer : public printer {
12591280
void print_header(const cmd_params & params) override {
12601281
// select fields to print
12611282
fields.emplace_back("model");
1283+
if (params.lora.size() > 1 || (!params.lora.empty() && params.lora[0] != "none")) {
1284+
fields.emplace_back("lora");
1285+
}
12621286
fields.emplace_back("size");
12631287
fields.emplace_back("params");
12641288
fields.emplace_back("backend");
@@ -1337,6 +1361,8 @@ struct markdown_printer : public printer {
13371361
char buf[128];
13381362
if (field == "model") {
13391363
value = t.model_type;
1364+
} else if (field == "lora") {
1365+
value = t.lora_filename.empty() || t.lora_filename == "none" ? "N" : "Y";
13401366
} else if (field == "size") {
13411367
if (t.model_size < 1024 * 1024 * 1024) {
13421368
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
@@ -1561,6 +1587,9 @@ int main(int argc, char ** argv) {
15611587
}
15621588

15631589
lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
1590+
if (!inst.lora.empty() && inst.lora != "none") {
1591+
llama_adapter_lora_init(lmodel, inst.lora.c_str());
1592+
}
15641593
if (lmodel == NULL) {
15651594
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
15661595
return 1;

0 commit comments

Comments (0)