@@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
10801080         }
10811081         return true;
10821082     }
1083+    if (arg == "--cpu-moe" || arg == "-cmoe") {
1084+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
1085+        return true;
1086+    }
1087+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
1088+        CHECK_ARG
1089+        int32_t n_layers = std::stoi(argv[i]);
1090+        if (n_layers < 0) {
1091+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
1092+            invalid_param = true;
1093+            return true;
1094+        }
1095+        for (int32_t l = 0; l < n_layers; ++l) {
1096+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
1097+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
1098+        }
1099+        return true;
1100+    }
10831101     if (arg == "--no-mmap") {
10841102         params.use_mmap = false;
10851103         return true;
@@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
17941812         options.push_back({ "*", "--no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
17951813     }
17961814     options.push_back({ "*", "--run-time-repack", "repack tensors if interleaved variant is available" });
1815+    options.push_back({ "*", "--cpu-moe", "keep all MoE weights in CPU memory" });
1816+    options.push_back({ "*", "--n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory" });
17971817     options.push_back({ "*", "--numa TYPE", "attempt optimizations that help on some NUMA systems\n"
17981818                                             "- distribute: spread execution evenly over all nodes\n"
17991819                                             "- isolate: only spawn threads on CPUs on the node that execution started on\n"
0 commit comments