@@ -2375,20 +2375,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 }
                 throw std::invalid_argument("unknown buffer type");
             }
-            // FIXME: this leaks memory
             params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
         }
     }
 ));
 add_opt(common_arg(
-    {"--cpu-moe"},
-    "use CPU for Mixture of Experts (MoE) weights",
+    {"--cpu-moe", "-cmoe"},
+    "keep all Mixture of Experts (MoE) weights in the CPU",
     [](common_params & params) {
-        params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
     }
 ).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+    {"--n-cpu-moe", "-ncmoe"}, "N",
+    "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+    [](common_params & params, int value) {
+        if (value < 0) {
+            throw std::invalid_argument("invalid value");
+        }
+        for (int i = 0; i < value; ++i) {
+            params.tensor_buft_overrides.push_back({strdup(string_format("\\.%d\\.ffn_(up|down|gate)_exps", i).c_str()), ggml_backend_cpu_buffer_type()});
+        }
+    }
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
     {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
     "number of layers to store in VRAM",