From fa54b58caa1cf55eecf6cc8e575de0c9f817619c Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Fri, 6 Jun 2025 09:04:52 +0300
Subject: [PATCH 1/2] Check if ffn_up and ffn_gate are of the same type before using fmoe

---
 src/llama.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index be404500e..5433d77f0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3447,6 +3447,26 @@ static bool llama_kv_cache_init(
         buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
     }
 
+    if (cparams.fused_moe_up_gate) {
+        int nbad = 0;
+        for (int i = 0; i < (int) n_layer; i++) {
+            auto& layer = model.layers[i];
+            if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
+                ++nbad;
+            }
+        }
+        if (nbad > 0) {
+            if (nbad == (int)n_layer) {
+                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
+                const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
+            }
+            else {
+                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
+                        nbad, (int)n_layer);
+            }
+        }
+    }
+
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -9841,7 +9861,7 @@ llm_expert_gating_func_type gating_op,
     }
 
     ggml_tensor * par;
-    if (lctx.cparams.fused_moe_up_gate) {
+    if (lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
         par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
     } else {
         ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

From fdfad307218eee25dd048b1cde02a70f71a27e4d Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Fri, 6 Jun 2025 12:05:10 +0300
Subject: [PATCH 2/2] Just leave the check.

---
 src/llama.cpp | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5433d77f0..87a6214ba 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3447,25 +3447,25 @@ static bool llama_kv_cache_init(
         buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
     }
 
-    if (cparams.fused_moe_up_gate) {
-        int nbad = 0;
-        for (int i = 0; i < (int) n_layer; i++) {
-            auto& layer = model.layers[i];
-            if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
-                ++nbad;
-            }
-        }
-        if (nbad > 0) {
-            if (nbad == (int)n_layer) {
-                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
-                const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
-            }
-            else {
-                LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
-                        nbad, (int)n_layer);
-            }
-        }
-    }
+    //if (cparams.fused_moe_up_gate) {
+    //    int nbad = 0;
+    //    for (int i = 0; i < (int) n_layer; i++) {
+    //        auto& layer = model.layers[i];
+    //        if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
+    //            ++nbad;
+    //        }
+    //    }
+    //    if (nbad > 0) {
+    //        if (nbad == (int)n_layer) {
+    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
+    //            const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
+    //        }
+    //        else {
+    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
+    //                    nbad, (int)n_layer);
+    //        }
+    //    }
+    //}
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;