
Commit 6228689

Port CPU MoE options from mainline (ikawrakow#672)
Co-Authored-By: Parsa <[email protected]>
1 parent: 2de24e0


common/common.cpp

Lines changed: 20 additions & 0 deletions
@@ -1106,6 +1106,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--cpu-moe" || arg == "-cmoe") {
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        return true;
+    }
+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
+        CHECK_ARG
+        int32_t n_layers = std::stoi(argv[i]);
+        if (n_layers < 0) {
+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
+            invalid_param = true;
+            return true;
+        }
+        for (int32_t l = 0; l < n_layers; ++l) {
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        }
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1820,6 +1838,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
     options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"});
+    options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"});
+    options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"});
     options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
                              " - distribute: spread execution evenly over all nodes\n"
                              " - isolate: only spawn threads on CPUs on the node that execution started on\n"
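Both new flags work by registering regex-based tensor-buffer-type overrides that pin the MoE expert projections (ffn_up_exps, ffn_down_exps, ffn_gate_exps) to the CPU buffer type: --cpu-moe matches those tensors in every layer, while --n-cpu-moe N matches only layers blk.0 through blk.(N-1). The standalone sketch below only illustrates what the registered patterns match; the sample tensor names and the direct use of std::regex are assumptions for illustration, since the actual matching is done later by the model loader's override handling, not by this argument parser.

// Minimal standalone sketch (not part of the commit): shows which tensor names the
// patterns added by --cpu-moe and --n-cpu-moe are intended to match. The GGUF-style
// names below are hypothetical examples; std::regex_search stands in for the loader's
// own matching logic.
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Pattern registered by --cpu-moe: any MoE expert up/down/gate projection.
    const std::string all_moe = "\\.ffn_(up|down|gate)_exps";

    // Patterns registered by --n-cpu-moe 2: expert projections of layers 0 and 1 only.
    std::vector<std::string> first_n;
    for (int32_t l = 0; l < 2; ++l) {
        first_n.push_back("blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)");
    }

    // Hypothetical tensor names used purely for illustration.
    const std::vector<std::string> names = {
        "blk.0.ffn_up_exps.weight",
        "blk.1.ffn_gate_exps.weight",
        "blk.7.ffn_down_exps.weight",
        "blk.7.attn_q.weight",
    };

    for (const auto & name : names) {
        const bool cpu_all = std::regex_search(name, std::regex(all_moe));
        bool cpu_first = false;
        for (const auto & p : first_n) {
            if (std::regex_search(name, std::regex(p))) { cpu_first = true; break; }
        }
        printf("%-28s --cpu-moe: %-3s --n-cpu-moe 2: %s\n",
               name.c_str(), cpu_all ? "CPU" : "-", cpu_first ? "CPU" : "-");
    }
    return 0;
}

The intended effect, per the help text, is that the large expert tensors stay in system memory (for all layers with --cpu-moe, or just the first N with --n-cpu-moe N) while the remaining weights can still be placed on other backends as usual.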
