From 8833f22e6360b751cde477a3e3b5039868c54ed4 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 31 Jul 2025 18:03:03 +0200
Subject: [PATCH] llama : add simple option to enable CPU for MoE weights
 (--cpu-moe)

---
 common/arg.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 7744fd6c48876..ec7e865d425ec 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2373,6 +2373,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",