
Commit 053234f

tamarPal authored and committed
refactor: use standard build_moe_ffn instead of custom build_mergez_moe_ffn
- Remove the custom build_mergez_moe_ffn implementation (100+ lines)
- Use the existing build_moe_ffn with LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID
- Pre-compute gate logits from pre_gate_hidden (Megrez-MoE's unique gating)
- Pass the pre-computed logits via the probs_in parameter
- Maintain exactly the same behavior and output quality

This addresses review feedback to reuse the existing MoE infrastructure instead of duplicating code. Sigmoid gating with the bias added after activation is already supported by build_moe_ffn.
1 parent c00b183 · commit 053234f
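For reference, the gating scheme being folded into build_moe_ffn (visible in the removed implementation below) works like this: apply a sigmoid to the router logits, add exp_probs_b to the sigmoid outputs only for selecting the top-k experts, then renormalize the unbiased sigmoid scores of the selected experts to obtain the mixture weights. A minimal standalone sketch of that arithmetic for one token and four experts, with made-up logits and bias values (plain C++, no ggml):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // made-up router output and per-expert bias (exp_probs_b) for a single token
    std::vector<float> logits = {0.2f, -1.0f, 1.5f, 0.3f};
    std::vector<float> bias   = {0.1f,  0.4f, 0.0f, -0.2f};
    const int n_expert_used = 2;

    // 1) sigmoid of the logits -- these are the expert scores before normalization
    std::vector<float> scores(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        scores[i] = 1.0f / (1.0f + std::exp(-logits[i]));
    }

    // 2) add the bias AFTER the sigmoid, but use it only to pick the top-k experts
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
        [&](int a, int b) { return scores[a] + bias[a] > scores[b] + bias[b]; });

    // 3) final weights = unbiased sigmoid scores of the selected experts, renormalized
    float sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) sum += scores[idx[k]];
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d weight %.4f\n", idx[k], scores[idx[k]] / sum);
    }
    return 0;
}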

3 files changed: +17 additions, −111 deletions

src/llama-graph.cpp

Lines changed: 0 additions & 94 deletions
@@ -1140,100 +1140,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
-ggml_tensor * llm_graph_context::build_mergez_moe_ffn(
-        ggml_tensor * cur,
-        ggml_tensor * hidden_state,
-        ggml_tensor * gate_inp,
-        ggml_tensor * exp_probs_b,
-        ggml_tensor * up_exps,
-        ggml_tensor * gate_exps,
-        ggml_tensor * down_exps,
-        int64_t n_expert,
-        int64_t n_expert_used,
-        int il) const {
-    const int64_t n_embd   = cur->ne[0];
-    const int64_t n_tokens = cur->ne[1];
-
-    ggml_tensor * logits = build_lora_mm(gate_inp, hidden_state); // [n_expert, n_tokens]
-    cb(logits, "ffn_moe_logits", il);
-
-    ggml_tensor * normalized_logits = nullptr;
-    ggml_tensor * probs = nullptr;
-    if (exp_probs_b) {
-        // For Megrez: sigmoid THEN add bias (not the other way around!)
-        normalized_logits = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
-        cb(normalized_logits, "ffn_moe_logits_normalize", il);
-        probs = ggml_add(ctx0, normalized_logits, exp_probs_b); // Add bias AFTER sigmoid
-        cb(probs, "ffn_moe_probs", il);
-    } else {
-        probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens]
-    }
-
-    // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
-    cb(selected_experts, "ffn_moe_topk", il);
-
-    ggml_tensor * weights = nullptr;
-    if (exp_probs_b) {
-        ggml_tensor * weight0s = ggml_get_rows(ctx0,
-                ggml_reshape_3d(ctx0, normalized_logits, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-        cb(weight0s, "ffn_moe_weights0", il);
-        weight0s = ggml_reshape_2d(ctx0, weight0s, n_expert_used, n_tokens);
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weight0s); // [1, n_tokens]
-        cb(weights_sum, "ffn_moe_weights0_sum", il);
-        weights = ggml_div(ctx0, weight0s, weights_sum); // [n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_norm", il);
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-    } else {
-        weights = ggml_get_rows(ctx0,
-                ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights", il);
-    }
-
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
-
-    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(up, "ffn_moe_up", il);
-
-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
-
-    gate = ggml_silu(ctx0, gate);
-    cb(gate, "ffn_moe_silu", il);
-
-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
-
-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
-    cb(experts, "ffn_moe_down", il);
-
-    experts = ggml_mul(ctx0, experts, weights);
-    cb(experts, "ffn_moe_weighted", il);
-
-    // aggregate experts
-    ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
-        ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
-
-        if (i == 0) {
-            moe_out = cur_expert;
-        } else {
-            moe_out = ggml_add(ctx0, moe_out, cur_expert);
-        }
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
-    }
-
-    cb(moe_out, "ffn_moe_out", il);
-
-    return moe_out;
-}
-
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;

src/llama-graph.h

Lines changed: 0 additions & 13 deletions
@@ -672,19 +672,6 @@ struct llm_graph_context {
                     int   il,
             ggml_tensor * probs_in = nullptr) const;
 
-    // build Megrez MoE FFN (special gating with sigmoid + bias)
-    ggml_tensor * build_mergez_moe_ffn(
-            ggml_tensor * cur,
-            ggml_tensor * hidden_state,
-            ggml_tensor * gate_inp,
-            ggml_tensor * exp_probs_b,
-            ggml_tensor * up_exps,
-            ggml_tensor * gate_exps,
-            ggml_tensor * down_exps,
-                int64_t   n_expert,
-                int64_t   n_expert_used,
-                    int   il) const;
-
     //
     // inputs
     //

src/models/megrez-moe.cpp

Lines changed: 17 additions & 4 deletions
@@ -163,14 +163,27 @@ llm_build_megrez_moe::llm_build_megrez_moe(const llama_model & model, const llm_
             cb(cur, "ffn_out", il);
         } else {
             // MoE branch
-            ggml_tensor * moe_out = build_mergez_moe_ffn(cur,
-                    pre_gate_hidden,
-                    model.layers[il].ffn_gate_inp, model.layers[il].ffn_exp_probs_b,
+            // Note: Megrez-MoE uses pre_gate_hidden (from previous layer's FFN norm) for gating
+            // This is different from standard MoE which uses current layer's input
+            // Compute gate logits from pre_gate_hidden instead of cur
+            ggml_tensor * gate_logits = build_lora_mm(model.layers[il].ffn_gate_inp, pre_gate_hidden);
+            cb(gate_logits, "ffn_moe_logits", il);
+
+            // Use standard build_moe_ffn but with pre-computed gate logits
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
                     model.layers[((il - 1) / (3) * (3)) + 1].ffn_up_exps,
                     model.layers[((il - 1) / (3) * (3)) + 1].ffn_gate_exps,
                     model.layers[((il - 1) / (3) * (3)) + 1].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
                     n_expert, n_expert_used,
-                    il);
+                    LLM_FFN_SILU,
+                    true,  // norm_w
+                    false, // scale_w
+                    1.0f,  // w_scale
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il,
+                    gate_logits); // Use pre-computed logits from pre_gate_hidden
             cb(moe_out, "ffn_moe_out", il);
 
             pre_gate_hidden = cur;

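A side note on the unchanged context lines above: the expert tensors are taken from model.layers[((il - 1) / (3) * (3)) + 1], which with integer division resolves consecutive MoE layers to the first layer of their group of three (e.g. layers 1-3 map to 1, layers 4-6 to 4), suggesting the expert weights are shared across each group of three layers. A tiny check of that index arithmetic, with an arbitrary layer range for illustration:

#include <cstdio>

int main() {
    // same expression as in megrez-moe.cpp: integer division groups layers in threes
    for (int il = 1; il <= 9; ++il) {
        std::printf("layer %d -> expert tensors of layer %d\n", il, ((il - 1) / (3) * (3)) + 1);
    }
    return 0;
}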