Skip to content

Commit c7a04f2

Browse files
llama-bench: add --n-cpu-moe support
Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. Fix the table by trimming tensor_buft_overrides output in markdown_printer. Refactor duplicated ffn_(up|down|gate)_exps regex into helper functions.
1 parent f4e664f commit c7a04f2

File tree

3 files changed

+71
-5
lines changed

3 files changed

+71
-5
lines changed

common/arg.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2437,7 +2437,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24372437
{"--cpu-moe", "-cmoe"},
24382438
"keep all Mixture of Experts (MoE) weights in the CPU",
24392439
[](common_params & params) {
2440-
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2440+
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24412441
}
24422442
).set_env("LLAMA_ARG_CPU_MOE"));
24432443
add_opt(common_arg(
@@ -2450,7 +2450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24502450
for (int i = 0; i < value; ++i) {
24512451
// keep strings alive and avoid leaking memory by storing them in a static vector
24522452
static std::list<std::string> buft_overrides;
2453-
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2453+
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
24542454
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
24552455
}
24562456
}
@@ -2459,7 +2459,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24592459
{"--cpu-moe-draft", "-cmoed"},
24602460
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
24612461
[](common_params & params) {
2462-
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2462+
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24632463
}
24642464
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
24652465
add_opt(common_arg(
@@ -2471,7 +2471,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24712471
}
24722472
for (int i = 0; i < value; ++i) {
24732473
static std::list<std::string> buft_overrides_draft;
2474-
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2474+
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
24752475
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
24762476
}
24772477
}

common/common.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -733,6 +733,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
733733

734734
}
735735

736+
//
737+
// MoE utils
738+
//
739+
740+
// Regex fragment matching the MoE expert FFN tensors (up/down/gate projections).
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

// Build the tensor-override pattern for a single layer,
// e.g. idx == 3 -> "blk\.3\.ffn_(up|down|gate)_exps".
inline std::string llm_ffn_exps_block_regex(int idx) {
    return "blk\\." + std::to_string(idx) + LLM_FFN_EXPS_REGEX;
}
745+
746+
// Override entry that pins all MoE expert FFN tensors to the CPU buffer type.
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    llama_model_tensor_buft_override cpu_override = { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
    return cpu_override;
}
749+
736750
//
737751
// training utils
738752
//

tools/llama-bench/llama-bench.cpp

Lines changed: 53 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -250,6 +250,7 @@ struct cmd_params {
250250
std::vector<bool> cpu_strict;
251251
std::vector<int> poll;
252252
std::vector<int> n_gpu_layers;
253+
std::vector<int> n_cpu_moe;
253254
std::vector<std::string> rpc_servers;
254255
std::vector<llama_split_mode> split_mode;
255256
std::vector<int> main_gpu;
@@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = {
286287
/* cpu_strict */ { false },
287288
/* poll */ { 50 },
288289
/* n_gpu_layers */ { 99 },
290+
/* n_cpu_moe */ { 0 },
289291
/* rpc_servers */ { "" },
290292
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
291293
/* main_gpu */ { 0 },
@@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) {
353355
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
354356
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
355357
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
358+
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
359+
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
356360
if (llama_supports_rpc()) {
357361
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
358362
join(cmd_params_defaults.rpc_servers, ",").c_str());
@@ -564,6 +568,45 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
564568
}
565569
auto p = parse_int_range(argv[i]);
566570
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
571+
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
572+
if (++i >= argc) {
573+
invalid_param = true;
574+
break;
575+
}
576+
577+
const auto values = parse_int_range(argv[i]);
578+
if (values.size() != 1) {
579+
invalid_param = true;
580+
break;
581+
}
582+
583+
const int n_layers = values[0];
584+
if (n_layers < 0) {
585+
invalid_param = true;
586+
break;
587+
}
588+
589+
if (n_layers > 0) {
590+
static std::vector<std::vector<std::string> > buft_batches;
591+
buft_batches.emplace_back();
592+
std::vector<std::string> & batch = buft_batches.back();
593+
batch.reserve(static_cast<size_t>(n_layers));
594+
595+
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides;
596+
group_tensor_buft_overrides.reserve(static_cast<size_t>(n_layers) + 1);
597+
598+
for (int i = 0; i < n_layers; ++i) {
599+
batch.push_back(llm_ffn_exps_block_regex(i));
600+
const char * pattern = batch.back().c_str();
601+
group_tensor_buft_overrides.push_back({
602+
pattern,
603+
ggml_backend_cpu_buffer_type()
604+
});
605+
}
606+
607+
group_tensor_buft_overrides.push_back({ nullptr, nullptr });
608+
params.tensor_buft_overrides.push_back(std::move(group_tensor_buft_overrides));
609+
}
567610
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
568611
if (++i >= argc) {
569612
invalid_param = true;
@@ -1514,6 +1557,9 @@ struct markdown_printer : public printer {
15141557
if (field == "no_op_offload") {
15151558
return 4;
15161559
}
1560+
if (field == "tensor_buft_overrides") {
1561+
return 40;
1562+
}
15171563

15181564
int width = std::max((int) field.length(), 10);
15191565

@@ -1683,7 +1729,13 @@ struct markdown_printer : public printer {
16831729
exit(1);
16841730
}
16851731

1686-
int width = get_field_width(field);
1732+
unsigned int width = get_field_width(field);
1733+
1734+
if (field == "tensor_buft_overrides") {
1735+
if (value.size() > width)
1736+
value.erase(width);
1737+
}
1738+
16871739
if (field == "t/s") {
16881740
// HACK: the utf-8 character is 2 bytes
16891741
width += 1;

0 commit comments

Comments (0)