
Commit 260e030

llama : add --n-cpu-moe option
Keeps the MoE weights of the first N layers in the CPU
1 parent ef0144c
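
A usage sketch (illustrative, not part of the commit): with full GPU offload requested via -ngl, the new flag keeps only the expert tensors of the first N layers in host memory. The model path below is a placeholder.

    llama-cli -m model.gguf -ngl 99 --n-cpu-moe 10

Per the .set_env() calls in the diff below, the same value can also be supplied through the LLAMA_ARG_N_CPU_MOE environment variable.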

File tree: 3 files changed, +23 -6 lines (common/arg.cpp, common/common.cpp, common/common.h)

common/arg.cpp

Lines changed: 15 additions & 6 deletions
@@ -2375,20 +2375,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
                 params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                params.tensor_buft_overrides.push_back({strdup(string_format("\\.%d\\.ffn_(up|down|gate)_exps", i).c_str()), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",

common/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -1565,3 +1565,9 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
 
     return result;
 }
+
+common_params::~common_params() {
+    for (auto & ot : tensor_buft_overrides) {
+        free(const_cast<char *>(ot.pattern));
+    }
+}
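
This destructor establishes the ownership rule that replaces the removed "FIXME: this leaks memory" comment: every pattern string stored in tensor_buft_overrides must be heap-allocated, which is why the --cpu-moe lambda above now wraps its string literal in strdup() (calling free() on a string literal would be undefined behavior). A minimal sketch of the invariant, using a hypothetical stand-in for the override entry type:

#include <cstdlib>
#include <cstring>
#include <vector>

// hypothetical stand-in for the pattern-owning part of the override entry
struct override_entry {
    const char * pattern;
};

int main() {
    std::vector<override_entry> overrides;
    overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps")}); // heap copy: safe to free
    // overrides.push_back({"\\.literal$"}); // a literal here would make free() UB
    for (auto & ot : overrides) {
        free(const_cast<char *>(ot.pattern));
    }
    return 0;
}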

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -241,6 +241,8 @@ enum common_reasoning_format {
 };
 
 struct common_params {
+    ~common_params();
+
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)