examples/llama-bench/llama-bench.cpp: 43 changes (36 additions & 7 deletions)
@@ -157,6 +157,7 @@ static std::string pair_str(const std::pair<int, int> & p) {

struct cmd_params {
std::vector<std::string> model;
std::vector<std::string> lora;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -189,6 +190,7 @@ struct cmd_params {

static const cmd_params cmd_params_defaults = {
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
/* lora */ { "none" },
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -225,6 +227,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" --lora <filename> (default: %s)\n", join(cmd_params_defaults.lora, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n",
join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
@@ -341,6 +344,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "--lora") {
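            // the value may be a comma-separated list of adapter paths; "none" (the default) means no adapter is applied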
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.lora.insert(params.lora.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -606,6 +616,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.model.empty()) {
params.model = cmd_params_defaults.model;
}
if (params.lora.empty()) {
params.lora = cmd_params_defaults.lora;
}
if (params.n_prompt.empty()) {
params.n_prompt = cmd_params_defaults.n_prompt;
}
@@ -672,6 +685,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {

struct cmd_params_instance {
std::string model;
std::string lora;
int n_prompt;
int n_gen;
int n_batch;
@@ -737,7 +751,7 @@ struct cmd_params_instance {
}

bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
return model == other.model && lora == other.lora && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
}
@@ -764,6 +778,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded
// clang-format off
for (const auto & m : params.model)
for (const auto & l : params.lora)
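    // lora is iterated right after model, so consecutive instances share the same (model, lora) pair and equal_mparams() can still reuse the loaded model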
for (const auto & nl : params.n_gpu_layers)
for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
@@ -787,6 +802,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
}
cmd_params_instance instance = {
/* .model = */ m,
/* .lora = */ l,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ 0,
/* .n_batch = */ nb,
@@ -816,6 +832,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
}
cmd_params_instance instance = {
/* .model = */ m,
/* .lora = */ l,
/* .n_prompt = */ 0,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
@@ -845,6 +862,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
}
cmd_params_instance instance = {
/* .model = */ m,
/* .lora = */ l,
/* .n_prompt = */ n_pg.first,
/* .n_gen = */ n_pg.second,
/* .n_batch = */ nb,
@@ -879,6 +897,7 @@ struct test {
static const std::string cpu_info;
static const std::string gpu_info;
std::string model_filename;
std::string lora_filename;
std::string model_type;
uint64_t model_size;
uint64_t model_n_params;
@@ -905,6 +924,7 @@

test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model;
lora_filename = inst.lora;
char buf[128];
llama_model_desc(lmodel, buf, sizeof(buf));
model_type = buf;
@@ -966,12 +986,12 @@ struct test {

static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts",
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
"lora_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch",
"n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v",
"n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split",
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
"stddev_ns", "avg_ts", "stddev_ts",
};
return fields;
}
@@ -1017,6 +1037,7 @@ struct test {
gpu_info,
get_backend(),
model_filename,
lora_filename,
model_type,
std::to_string(model_size),
std::to_string(model_n_params),
@@ -1259,6 +1280,9 @@ struct markdown_printer : public printer {
void print_header(const cmd_params & params) override {
// select fields to print
fields.emplace_back("model");
if (params.lora.size() > 1 || (!params.lora.empty() && params.lora[0] != "none")) {
fields.emplace_back("lora");
}
fields.emplace_back("size");
fields.emplace_back("params");
fields.emplace_back("backend");
@@ -1337,6 +1361,8 @@ struct markdown_printer : public printer {
char buf[128];
if (field == "model") {
value = t.model_type;
} else if (field == "lora") {
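            // the markdown table only reports whether an adapter was applied; the full path is still recorded in the lora_filename field used by the other printers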
value = t.lora_filename.empty() || t.lora_filename == "none" ? "N" : "Y";
} else if (field == "size") {
if (t.model_size < 1024 * 1024 * 1024) {
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
@@ -1561,6 +1587,9 @@ int main(int argc, char ** argv) {
}

lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
        if (lmodel != NULL && !inst.lora.empty() && inst.lora != "none") {
llama_adapter_lora_init(lmodel, inst.lora.c_str());
}
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
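
Note: llama_adapter_lora_init returns an adapter handle; in the llama.cpp API the adapter is attached to a context with llama_set_adapter_lora before it affects inference. A minimal sketch of how that could look for this instance, assuming a scale of 1.0; the adapter error handling and the ctx variable are assumptions and not part of this diff:

    llama_adapter_lora * adapter = nullptr;
    if (!inst.lora.empty() && inst.lora != "none") {
        adapter = llama_adapter_lora_init(lmodel, inst.lora.c_str());
        if (adapter == nullptr) {
            fprintf(stderr, "%s: error: failed to load lora adapter '%s'\n", __func__, inst.lora.c_str());
            return 1;
        }
    }
    // ... after the llama_context for this instance has been created ...
    if (adapter != nullptr) {
        llama_set_adapter_lora(ctx, adapter, 1.0f); // scale of 1.0 assumed
    }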