common/common.cpp (20 additions, 0 deletions)
@@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--cpu-moe" || arg == "-cmoe") {
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        return true;
+    }
+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
+        CHECK_ARG
+        int32_t n_layers = std::stoi(argv[i]);
+        if (n_layers < 0) {
+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
+            invalid_param = true;
+            return true;
+        }
+        for (int32_t l = 0; l < n_layers; ++l) {
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        }
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
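
Note on the hunk above: each entry pushed into params.tensor_buft_overrides is a {regex pattern, buffer type} pair; the model loader later tests every tensor name against these patterns and allocates matching tensors in the given buffer type (here the CPU buffer), so expert FFN tensors stay in system RAM instead of being offloaded. The strdup calls follow from this: the override table stores raw C-string patterns that must outlive the local std::string built in the loop. Below is a minimal standalone sketch of the matching, assuming std::regex_search semantics as in the upstream loader; the tensor names and the main() driver are illustrative, not loader code.

    // Illustrative only: how a {pattern, buffer type} override pair selects
    // tensors. The real matching happens in the model loader; the CPU buffer
    // type is stood in for by a boolean here.
    #include <cstdio>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        // The patterns --n-cpu-moe 2 would generate with the loop above.
        std::vector<std::string> patterns;
        for (int l = 0; l < 2; ++l) {
            patterns.push_back("blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)");
        }
        const char * names[] = {
            "blk.0.ffn_up_exps.weight",   // expert FFN, layer 0 -> CPU
            "blk.1.ffn_gate_exps.weight", // expert FFN, layer 1 -> CPU
            "blk.2.ffn_down_exps.weight", // layer 2 has no pattern -> default
            "blk.0.attn_q.weight",        // attention tensor -> default
        };
        for (const char * name : names) {
            bool on_cpu = false;
            for (const auto & p : patterns) {
                if (std::regex_search(name, std::regex(p))) { on_cpu = true; break; }
            }
            printf("%-28s -> %s\n", name, on_cpu ? "CPU buffer" : "default buffer");
        }
        return 0;
    }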
@@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
}
options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"});
options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"});
options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"});
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
" - distribute: spread execution evenly over all nodes\n"
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
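
Usage note (model path, binary name, and flag combination here are illustrative, not from this PR): together with full GPU offload these options keep only the large, sparsely activated expert tensors in system RAM while attention and shared weights remain on the GPU, e.g.

    ./llama-server -m model.gguf -ngl 99 --n-cpu-moe 10

which pushes the per-layer override patterns for layers 0-9 shown in the first hunk; --cpu-moe is the blanket form, effectively a shorthand for overriding every tensor matching \.ffn_(up|down|gate)_exps to the CPU buffer type.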