Skip to content

Commit c7a04f2

Browse files
llama-bench: add --n-cpu-moe support
Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. Fix the table by trimming tensor_buft_overrides output in markdown_printer. Refactor duplicated ffn_(up|down|gate)_exps regex into helper functions.
1 parent f4e664f commit c7a04f2

File tree

3 files changed

+71
-5
lines changed

3 files changed

+71
-5
lines changed

common/arg.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2437,7 +2437,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24372437
{"--cpu-moe", "-cmoe"},
24382438
"keep all Mixture of Experts (MoE) weights in the CPU",
24392439
[](common_params & params) {
2440-
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2440+
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24412441
}
24422442
).set_env("LLAMA_ARG_CPU_MOE"));
24432443
add_opt(common_arg(
@@ -2450,7 +2450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24502450
for (int i = 0; i < value; ++i) {
24512451
// keep strings alive and avoid leaking memory by storing them in a static vector
24522452
static std::list<std::string> buft_overrides;
2453-
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2453+
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
24542454
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
24552455
}
24562456
}
@@ -2459,7 +2459,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24592459
{"--cpu-moe-draft", "-cmoed"},
24602460
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
24612461
[](common_params & params) {
2462-
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2462+
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
24632463
}
24642464
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
24652465
add_opt(common_arg(
@@ -2471,7 +2471,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24712471
}
24722472
for (int i = 0; i < value; ++i) {
24732473
static std::list<std::string> buft_overrides_draft;
2474-
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2474+
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
24752475
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
24762476
}
24772477
}

common/common.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -733,6 +733,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
733733

734734
}
735735

736+
//
737+
// MoE utils
738+
//
739+
740+
// Regex fragment matching the MoE expert FFN tensors (up/down/gate projections).
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

// Build the tensor-override pattern for a single layer,
// e.g. idx == 3 -> "blk\.3\.ffn_(up|down|gate)_exps".
inline std::string llm_ffn_exps_block_regex(int idx) {
    return "blk\\." + std::to_string(idx) + LLM_FFN_EXPS_REGEX;
}
745+
746+
// Override entry that pins all MoE expert FFN tensors to the CPU buffer type.
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    llama_model_tensor_buft_override cpu_override = { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
    return cpu_override;
}
749+
736750
//
737751
// training utils
738752
//

tools/llama-bench/llama-bench.cpp

Lines changed: 53 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -250,6 +250,7 @@ struct cmd_params {
250250
std::vector<bool> cpu_strict;
251251
std::vector<int> poll;
252252
std::vector<int> n_gpu_layers;
253+
std::vector<int> n_cpu_moe;
253254
std::vector<std::string> rpc_servers;
254255
std::vector<llama_split_mode> split_mode;
255256
std::vector<int> main_gpu;
@@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = {
286287
/* cpu_strict */ { false },
287288
/* poll */ { 50 },
288289
/* n_gpu_layers */ { 99 },
290+
/* n_cpu_moe */ { 0 },
289291
/* rpc_servers */ { "" },
290292
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
291293
/* main_gpu */ { 0 },
@@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) {
353355
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
354356
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
355357
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
358+
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
359+
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
356360
if (llama_supports_rpc()) {
357361
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
358362
join(cmd_params_defaults.rpc_servers, ",").c_str());
@@ -564,6 +568,45 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
564568
}
565569
auto p = parse_int_range(argv[i]);
566570
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
571+
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
572+
if (++i >= argc) {
573+
invalid_param = true;
574+
break;
575+
}
576+
577+
const auto values = parse_int_range(argv[i]);
578+
if (values.size() != 1) {
579+
invalid_param = true;
580+
break;
581+
}
582+
583+
const int n_layers = values[0];
584+
if (n_layers < 0) {
585+
invalid_param = true;
586+
break;
587+
}
588+
589+
if (n_layers > 0) {
590+
static std::vector<std::vector<std::string> > buft_batches;
591+
buft_batches.emplace_back();
592+
std::vector<std::string> & batch = buft_batches.back();
593+
batch.reserve(static_cast<size_t>(n_layers));
594+
595+
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides;
596+
group_tensor_buft_overrides.reserve(static_cast<size_t>(n_layers) + 1);
597+
598+
for (int i = 0; i < n_layers; ++i) {
599+
batch.push_back(llm_ffn_exps_block_regex(i));
600+
const char * pattern = batch.back().c_str();
601+
group_tensor_buft_overrides.push_back({
602+
pattern,
603+
ggml_backend_cpu_buffer_type()
604+
});
605+
}
606+
607+
group_tensor_buft_overrides.push_back({ nullptr, nullptr });
608+
params.tensor_buft_overrides.push_back(std::move(group_tensor_buft_overrides));
609+
}
567610
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
568611
if (++i >= argc) {
569612
invalid_param = true;
@@ -1514,6 +1557,9 @@ struct markdown_printer : public printer {
15141557
if (field == "no_op_offload") {
15151558
return 4;
15161559
}
1560+
if (field == "tensor_buft_overrides") {
1561+
return 40;
1562+
}
15171563

15181564
int width = std::max((int) field.length(), 10);
15191565

@@ -1683,7 +1729,13 @@ struct markdown_printer : public printer {
16831729
exit(1);
16841730
}
16851731

1686-
int width = get_field_width(field);
1732+
unsigned int width = get_field_width(field);
1733+
1734+
if (field == "tensor_buft_overrides") {
1735+
if (value.size() > width)
1736+
value.erase(width);
1737+
}
1738+
16871739
if (field == "t/s") {
16881740
// HACK: the utf-8 character is 2 bytes
16891741
width += 1;

0 commit comments

Comments (0)