Skip to content

Commit 33d9591

Browse files
llama-bench: add --n-cpu-moe support
Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. Refactor duplicated ffn_(up|down|gate)_exps regex into helper functions.
1 parent f4e664f commit 33d9591

File tree

3 files changed

+90
-19
lines changed

3 files changed

+90
-19
lines changed

common/arg.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2437,7 +2437,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24372437
{"--cpu-moe", "-cmoe"},
24382438
"keep all Mixture of Experts (MoE) weights in the CPU",
24392439
[](common_params & params) {
2440-
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2440+
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24412441
}
24422442
).set_env("LLAMA_ARG_CPU_MOE"));
24432443
add_opt(common_arg(
@@ -2450,7 +2450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24502450
for (int i = 0; i < value; ++i) {
24512451
// keep strings alive and avoid leaking memory by storing them in a static list (std::list never relocates its elements, so the stored c_str() pointers stay valid)
24522452
static std::list<std::string> buft_overrides;
2453-
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2453+
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
24542454
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
24552455
}
24562456
}
@@ -2459,7 +2459,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24592459
{"--cpu-moe-draft", "-cmoed"},
24602460
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
24612461
[](common_params & params) {
2462-
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2462+
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24632463
}
24642464
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
24652465
add_opt(common_arg(
@@ -2471,7 +2471,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24712471
}
24722472
for (int i = 0; i < value; ++i) {
24732473
static std::list<std::string> buft_overrides_draft;
2474-
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2474+
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
24752475
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
24762476
}
24772477
}

common/common.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -733,6 +733,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
733733

734734
}
735735

736+
//
737+
// MoE utils
738+
//
739+
740+
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
741+
742+
inline std::string llm_ffn_exps_block_regex(int idx) {
743+
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
744+
}
745+
746+
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
747+
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
748+
}
749+
736750
//
737751
// training utils
738752
//

tools/llama-bench/llama-bench.cpp

Lines changed: 72 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -250,6 +250,7 @@ struct cmd_params {
250250
std::vector<bool> cpu_strict;
251251
std::vector<int> poll;
252252
std::vector<int> n_gpu_layers;
253+
std::vector<int> n_cpu_moe;
253254
std::vector<std::string> rpc_servers;
254255
std::vector<llama_split_mode> split_mode;
255256
std::vector<int> main_gpu;
@@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = {
286287
/* cpu_strict */ { false },
287288
/* poll */ { 50 },
288289
/* n_gpu_layers */ { 99 },
290+
/* n_cpu_moe */ { 0 },
289291
/* rpc_servers */ { "" },
290292
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
291293
/* main_gpu */ { 0 },
@@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) {
353355
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
354356
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
355357
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
358+
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
359+
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
356360
if (llama_supports_rpc()) {
357361
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
358362
join(cmd_params_defaults.rpc_servers, ",").c_str());
@@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
564568
}
565569
auto p = parse_int_range(argv[i]);
566570
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
571+
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
572+
if (++i >= argc) {
573+
invalid_param = true;
574+
break;
575+
}
576+
auto p = parse_int_range(argv[i]);
577+
params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end());
567578
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
568579
if (++i >= argc) {
569580
invalid_param = true;
@@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
841852
if (params.n_gpu_layers.empty()) {
842853
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
843854
}
855+
if (params.n_cpu_moe.empty()) {
856+
params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
857+
}
844858
if (params.rpc_servers.empty()) {
845859
params.rpc_servers = cmd_params_defaults.rpc_servers;
846860
}
@@ -901,6 +915,7 @@ struct cmd_params_instance {
901915
bool cpu_strict;
902916
int poll;
903917
int n_gpu_layers;
918+
int n_cpu_moe;
904919
std::string rpc_servers_str;
905920
llama_split_mode split_mode;
906921
int main_gpu;
@@ -973,20 +988,50 @@ struct cmd_params_instance {
973988
mparams.tensor_split = tensor_split.data();
974989
mparams.use_mmap = use_mmap;
975990

976-
if (tensor_buft_overrides.empty()) {
977-
mparams.tensor_buft_overrides = nullptr;
991+
if (n_cpu_moe <= 0) {
992+
if (tensor_buft_overrides.empty()) {
993+
mparams.tensor_buft_overrides = nullptr;
994+
} else {
995+
GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr &&
996+
"Tensor buffer overrides not terminated with empty pattern");
997+
mparams.tensor_buft_overrides = tensor_buft_overrides.data();
998+
}
978999
} else {
979-
GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
980-
mparams.tensor_buft_overrides = tensor_buft_overrides.data();
1000+
static std::vector<llama_model_tensor_buft_override> merged;
1001+
static std::vector<std::string> patterns;
1002+
1003+
merged.clear();
1004+
patterns.clear();
1005+
1006+
auto first = tensor_buft_overrides.begin();
1007+
auto last = tensor_buft_overrides.end();
1008+
if (first != last && (last - 1)->pattern == nullptr) {
1009+
--last;
1010+
}
1011+
merged.insert(merged.end(), first, last);
1012+
1013+
patterns.reserve((size_t) n_cpu_moe);
1014+
merged.reserve(merged.size() + (size_t) n_cpu_moe + 1);
1015+
1016+
for (int i = 0; i < n_cpu_moe; ++i) {
1017+
patterns.push_back(llm_ffn_exps_block_regex(i));
1018+
merged.push_back({ patterns.back().c_str(),
1019+
ggml_backend_cpu_buffer_type() });
1020+
}
1021+
1022+
merged.push_back({ nullptr, nullptr });
1023+
1024+
mparams.tensor_buft_overrides = merged.data();
9811025
}
9821026

9831027
return mparams;
9841028
}
9851029

9861030
bool equal_mparams(const cmd_params_instance & other) const {
987-
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
988-
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
989-
tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
1031+
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
1032+
rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
1033+
main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
1034+
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
9901035
}
9911036

9921037
llama_context_params to_llama_cparams() const {
@@ -1014,6 +1059,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10141059
// clang-format off
10151060
for (const auto & m : params.model)
10161061
for (const auto & nl : params.n_gpu_layers)
1062+
for (const auto & ncmoe : params.n_cpu_moe)
10171063
for (const auto & rpc : params.rpc_servers)
10181064
for (const auto & sm : params.split_mode)
10191065
for (const auto & mg : params.main_gpu)
@@ -1051,6 +1097,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10511097
/* .cpu_strict = */ cs,
10521098
/* .poll = */ pl,
10531099
/* .n_gpu_layers = */ nl,
1100+
/* .n_cpu_moe = */ ncmoe,
10541101
/* .rpc_servers = */ rpc,
10551102
/* .split_mode = */ sm,
10561103
/* .main_gpu = */ mg,
@@ -1083,6 +1130,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10831130
/* .cpu_strict = */ cs,
10841131
/* .poll = */ pl,
10851132
/* .n_gpu_layers = */ nl,
1133+
/* .n_cpu_moe = */ ncmoe,
10861134
/* .rpc_servers = */ rpc,
10871135
/* .split_mode = */ sm,
10881136
/* .main_gpu = */ mg,
@@ -1115,6 +1163,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
11151163
/* .cpu_strict = */ cs,
11161164
/* .poll = */ pl,
11171165
/* .n_gpu_layers = */ nl,
1166+
/* .n_cpu_moe = */ ncmoe,
11181167
/* .rpc_servers = */ rpc,
11191168
/* .split_mode = */ sm,
11201169
/* .main_gpu = */ mg,
@@ -1152,6 +1201,7 @@ struct test {
11521201
ggml_type type_k;
11531202
ggml_type type_v;
11541203
int n_gpu_layers;
1204+
int n_cpu_moe;
11551205
llama_split_mode split_mode;
11561206
int main_gpu;
11571207
bool no_kv_offload;
@@ -1186,6 +1236,7 @@ struct test {
11861236
type_k = inst.type_k;
11871237
type_v = inst.type_v;
11881238
n_gpu_layers = inst.n_gpu_layers;
1239+
n_cpu_moe = inst.n_cpu_moe;
11891240
split_mode = inst.split_mode;
11901241
main_gpu = inst.main_gpu;
11911242
no_kv_offload = inst.no_kv_offload;
@@ -1236,12 +1287,14 @@ struct test {
12361287

12371288
static const std::vector<std::string> & get_fields() {
12381289
static const std::vector<std::string> fields = {
1239-
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
1240-
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
1241-
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
1242-
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
1243-
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
1244-
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
1290+
"build_commit", "build_number", "cpu_info", "gpu_info", "backends",
1291+
"model_filename", "model_type", "model_size", "model_n_params", "n_batch",
1292+
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
1293+
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
1294+
"main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
1295+
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen",
1296+
"n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts",
1297+
"stddev_ts"
12451298
};
12461299
return fields;
12471300
}
@@ -1251,8 +1304,8 @@ struct test {
12511304
static field_type get_field_type(const std::string & field) {
12521305
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
12531306
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
1254-
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
1255-
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
1307+
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
1308+
field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") {
12561309
return INT;
12571310
}
12581311
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1320,6 +1373,7 @@ struct test {
13201373
ggml_type_name(type_k),
13211374
ggml_type_name(type_v),
13221375
std::to_string(n_gpu_layers),
1376+
std::to_string(n_cpu_moe),
13231377
split_mode_str(split_mode),
13241378
std::to_string(main_gpu),
13251379
std::to_string(no_kv_offload),
@@ -1568,6 +1622,9 @@ struct markdown_printer : public printer {
15681622
if (!is_cpu_backend) {
15691623
fields.emplace_back("n_gpu_layers");
15701624
}
1625+
if (params.n_cpu_moe.size() > 1) {
1626+
fields.emplace_back("n_cpu_moe");
1627+
}
15711628
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
15721629
fields.emplace_back("n_threads");
15731630
}

0 commit comments

Comments
 (0)