|
24 | 24 | #include <cstdarg>
25 | 25 | #include <filesystem>
26 | 26 | #include <fstream>
   | 27 | +#include <list>
27 | 28 | #include <regex>
28 | 29 | #include <set>
29 | 30 | #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2375 | 2376 |                     }
2376 | 2377 |                     throw std::invalid_argument("unknown buffer type");
2377 | 2378 |                 }
2378 |      | -                // FIXME: this leaks memory
2379 |      | -                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
     | 2379 | +                // keep strings alive and avoid leaking memory by storing them in a static list
     | 2380 | +                static std::list<std::string> buft_overrides;
     | 2381 | +                buft_overrides.push_back(tensor_name);
     | 2382 | +                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
2380 | 2383 |             }
2381 | 2384 |         }
2382 | 2385 |     ));
2383 | 2386 |     add_opt(common_arg(
2384 |      | -        {"--cpu-moe"},
2385 |      | -        "use CPU for Mixture of Experts (MoE) weights",
     | 2387 | +        {"--cpu-moe", "-cmoe"},
     | 2388 | +        "keep all Mixture of Experts (MoE) weights in the CPU",
2386 | 2389 |         [](common_params & params) {
2387 |      | -            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2388 |      | -            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2389 |      | -            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
     | 2390 | +            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2390 | 2391 |         }
2391 | 2392 |     ).set_env("LLAMA_ARG_CPU_MOE"));
     | 2393 | +    add_opt(common_arg(
     | 2394 | +        {"--n-cpu-moe", "-ncmoe"}, "N",
     | 2395 | +        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
     | 2396 | +        [](common_params & params, int value) {
     | 2397 | +            if (value < 0) {
     | 2398 | +                throw std::invalid_argument("invalid value");
     | 2399 | +            }
     | 2400 | +            for (int i = 0; i < value; ++i) {
     | 2401 | +                // keep strings alive and avoid leaking memory by storing them in a static list
     | 2402 | +                static std::list<std::string> buft_overrides;
     | 2403 | +                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
     | 2404 | +                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
     | 2405 | +            }
     | 2406 | +        }
     | 2407 | +    ).set_env("LLAMA_ARG_N_CPU_MOE"));
2392 | 2408 |     add_opt(common_arg(
2393 | 2409 |         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
2394 | 2410 |         "number of layers to store in VRAM",
|