@@ -26,6 +26,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2374,20 +2375,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights on the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers on the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
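
Note on the leak fix: params.tensor_buft_overrides stores raw const char * pointers, so the strings backing them must outlive argument parsing and must never move. A std::list never relocates existing elements on push_back, whereas a std::vector<std::string> may reallocate as it grows and, for short strings held in the small-string buffer, invalidate previously obtained c_str() pointers. Below is a minimal, self-contained sketch of the pattern, assuming nothing from llama.cpp; the intern() helper is illustrative, not part of the patch.

#include <cstdio>
#include <list>
#include <string>
#include <vector>

// Illustrative helper (not from the patch): keep strings alive in a
// static std::list and hand out stable C-string pointers. push_back on
// a list allocates a fresh node and never moves existing elements, so
// earlier pointers remain valid for the lifetime of the list.
static const char * intern(std::string s) {
    static std::list<std::string> storage;
    storage.push_back(std::move(s));
    return storage.back().c_str();
}

int main() {
    std::vector<const char *> patterns;
    // Mirror what --n-cpu-moe 3 generates: one regex per layer matching
    // the MoE expert tensors of layers 0..2.
    for (int i = 0; i < 3; ++i) {
        char buf[64];
        std::snprintf(buf, sizeof(buf), "blk\\.%d\\.ffn_(up|down|gate)_exps", i);
        patterns.push_back(intern(buf));
    }
    for (const char * p : patterns) {
        std::printf("%s\n", p);
    }
    return 0;
}

In effect, --cpu-moe (or the LLAMA_ARG_CPU_MOE environment variable) pins every tensor matching \.ffn_(up|down|gate)_exps to the CPU buffer type, while --n-cpu-moe N does the same only for the first N layers, blk.0 through blk.N-1.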
 | 