From fa54b58caa1cf55eecf6cc8e575de0c9f817619c Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Fri, 6 Jun 2025 09:04:52 +0300
Subject: [PATCH 1/2] Check if ffn_up and ffn_gate are of the same type before using fmoe

---
 src/llama.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index be404500e..5433d77f0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3447,6 +3447,26 @@ static bool llama_kv_cache_init(
         buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
     }
 
+    if (cparams.fused_moe_up_gate) {
+        int nbad = 0;
+        for (int i = 0; i < (int) n_layer; i++) {
+            auto& layer = model.layers[i];
+            if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
+                ++nbad;
+            }
+        }
+        if (nbad > 0) {
+            if (nbad == (int)n_layer) {
+                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
+                const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
+            }
+            else {
+                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
+                        nbad, (int)n_layer);
+            }
+        }
+    }
+
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -9841,7 +9861,7 @@ llm_expert_gating_func_type gating_op,
     }
 
     ggml_tensor * par;
-    if (lctx.cparams.fused_moe_up_gate) {
+    if (lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
         par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
     } else {
         ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

From fdfad307218eee25dd048b1cde02a70f71a27e4d Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Fri, 6 Jun 2025 12:05:10 +0300
Subject: [PATCH 2/2] Just leave the check.

---
 src/llama.cpp | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5433d77f0..87a6214ba 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3447,25 +3447,25 @@ static bool llama_kv_cache_init(
         buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
     }
 
-    if (cparams.fused_moe_up_gate) {
-        int nbad = 0;
-        for (int i = 0; i < (int) n_layer; i++) {
-            auto& layer = model.layers[i];
-            if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
-                ++nbad;
-            }
-        }
-        if (nbad > 0) {
-            if (nbad == (int)n_layer) {
-                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
-                const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
-            }
-            else {
-                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
-                        nbad, (int)n_layer);
-            }
-        }
-    }
+    //if (cparams.fused_moe_up_gate) {
+    //    int nbad = 0;
+    //    for (int i = 0; i < (int) n_layer; i++) {
+    //        auto& layer = model.layers[i];
+    //        if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
+    //            ++nbad;
+    //        }
+    //    }
+    //    if (nbad > 0) {
+    //        if (nbad == (int)n_layer) {
+    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
+    //            const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
+    //        }
+    //        else {
+    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
+    //                    nbad, (int)n_layer);
+    //        }
+    //    }
+    //}
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;