
Commit 260e030

llama : add --n-cpu-moe option
Keeps the MoE weights of the first N layers in the CPU
1 parent ef0144c
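
A usage sketch (illustrative, not part of the commit): with full GPU offload requested via -ngl, the new flag keeps only the expert tensors of the first N layers in host memory. The model path below is a placeholder.

    llama-cli -m model.gguf -ngl 99 --n-cpu-moe 10

Per the .set_env() calls in the diff below, the same value can also be supplied through the LLAMA_ARG_N_CPU_MOE environment variable.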

File tree: 3 files changed, +23 -6 lines (common/arg.cpp, common/common.cpp, common/common.h)

common/arg.cpp

Lines changed: 15 additions & 6 deletions
@@ -2375,20 +2375,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
                 params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                params.tensor_buft_overrides.push_back({strdup(string_format("\\.%d\\.ffn_(up|down|gate)_exps", i).c_str()), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",

common/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -1565,3 +1565,9 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
 
     return result;
 }
+
+common_params::~common_params() {
+    for (auto & ot : tensor_buft_overrides) {
+        free(const_cast<char *>(ot.pattern));
+    }
+}
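
This destructor establishes the ownership rule that replaces the removed "FIXME: this leaks memory" comment: every pattern string stored in tensor_buft_overrides must be heap-allocated, which is why the --cpu-moe lambda above now wraps its string literal in strdup() (calling free() on a string literal would be undefined behavior). A minimal sketch of the invariant, using a hypothetical stand-in for the override entry type:

#include <cstdlib>
#include <cstring>
#include <vector>

// hypothetical stand-in for the pattern-owning part of the override entry
struct override_entry {
    const char * pattern;
};

int main() {
    std::vector<override_entry> overrides;
    overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps")}); // heap copy: safe to free
    // overrides.push_back({"\\.literal$"}); // a literal here would make free() UB
    for (auto & ot : overrides) {
        free(const_cast<char *>(ot.pattern));
    }
    return 0;
}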

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -241,6 +241,8 @@ enum common_reasoning_format {
 };
 
 struct common_params {
+    ~common_params();
+
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)