
Commit 8e2cb21

new build_moe_ffn_from_probs, and can run 4b
1 parent a5274b7 commit 8e2cb21

3 files changed: 45 additions, 87 deletions

src/llama-graph.cpp

Lines changed: 30 additions & 67 deletions
@@ -947,14 +947,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
         ggml_tensor * exp_probs_b,
         int64_t n_expert,
         int64_t n_expert_used,
-        llm_ffn_op_type type_op,
-        bool norm_w,
-        bool scale_w,
-        float w_scale,
+        llama_expert_gating_func_type gating_op,
         int il) const {
     const int64_t n_embd = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
-    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
     // add experts selection bias - introduced in DeepSeek V3
     // leave probs unbiased as it's later used to get expert weights
@@ -973,90 +969,57 @@ ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
-    if (norm_w) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-
+    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
+        weights = ggml_soft_max(ctx0, weights);
+    } else {
+        weights = ggml_sigmoid(ctx0, weights);
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
-
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-    }
-    if (scale_w) {
-        weights = ggml_scale(ctx0, weights, w_scale);
-        cb(weights, "ffn_moe_weights_scaled", il);
     }
 
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
 
-    if (weight_before_ffn) {
-        // repeat cur to [n_embd, n_expert_used, n_tokens]
-        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
-        cur = ggml_mul(ctx0, repeated, weights);
-        cb(cur, "ffn_moe_weighted", il);
-    }
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
     ggml_tensor * experts = nullptr;
-    if (gate_exps) {
-        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-        cb(cur, "ffn_moe_gate", il);
-    } else {
-        cur = up;
-    }
+    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(cur, "ffn_moe_gate", il);
 
-    switch (type_op) {
-        case LLM_FFN_SILU:
-            if (gate_exps) {
-                cur = ggml_swiglu_split(ctx0, cur, up);
-                cb(cur, "ffn_moe_swiglu", il);
-            } else {
-                cur = ggml_silu(ctx0, cur);
-                cb(cur, "ffn_moe_silu", il);
-            } break;
-        case LLM_FFN_GELU:
-            if (gate_exps) {
-                cur = ggml_geglu_split(ctx0, cur, up);
-                cb(cur, "ffn_moe_geglu", il);
-            } else {
-                cur = ggml_gelu(ctx0, cur);
-                cb(cur, "ffn_moe_gelu", il);
-            } break;
-        case LLM_FFN_RELU:
-            if (gate_exps) {
-                cur = ggml_reglu_split(ctx0, cur, up);
-                cb(cur, "ffn_moe_reglu", il);
-            } else {
-                cur = ggml_relu(ctx0, cur);
-                cb(cur, "ffn_moe_relu", il);
-            } break;
-        default:
-            GGML_ABORT("fatal error");
-    }
+    cur = ggml_reglu_split(ctx0, cur, up);
+    cb(cur, "ffn_moe_reglu", il);
 
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
-    if (!weight_before_ffn) {
-        experts = ggml_mul(ctx0, experts, weights);
-        cb(cur, "ffn_moe_weighted", il);
+    experts = ggml_mul(ctx0, experts, weights);
+    cb(cur, "ffn_moe_weighted", il);
+
+    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+    assert(n_expert_used > 0);
+
+    // order the views before the adds
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+        ggml_build_forward_expand(gf, cur_experts[i]);
     }
 
     // aggregate experts
-    ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
-        ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    // to avoid potentially a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    ggml_tensor * moe_out = cur_experts[0];
 
-        if (i == 0) {
-            moe_out = cur_expert;
-        } else {
-            moe_out = ggml_add(ctx0, moe_out, cur_expert);
-        }
+    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
     }
 
     if (n_expert_used == 1) {
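The hunk above replaces the (type_op, norm_w, scale_w, w_scale) arguments with a single gating_op and hard-wires the REGLU expert FFN. A minimal scalar sketch of the new gating math, written against plain std::vector rather than ggml (the function and type names below are illustrative, not llama.cpp API):

// Scalar illustration of the new weight computation in build_moe_ffn_from_probs:
// given the gathered logits of the selected experts for one token, apply either
// a softmax, or a sigmoid followed by normalization to the row sum.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

enum class GatingFunc { Softmax, Sigmoid };

std::vector<float> expert_weights(const std::vector<float> & logits, GatingFunc gating) {
    assert(!logits.empty());
    std::vector<float> w(logits.size());
    if (gating == GatingFunc::Softmax) {
        // softmax over the selected logits; already sums to 1 per token
        float maxv = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) {
            w[i] = std::exp(logits[i] - maxv);
            sum += w[i];
        }
        for (float & v : w) v /= sum;
    } else {
        // sigmoid per expert, then divide by the row sum
        // (the ggml_sum_rows / ggml_div pair in the diff)
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) {
            w[i] = 1.0f / (1.0f + std::exp(-logits[i]));
            sum += w[i];
        }
        for (float & v : w) v /= sum;
    }
    return w;
}

int main() {
    const std::vector<float> selected_logits = {0.7f, -0.3f, 1.2f, 0.1f};
    for (GatingFunc g : {GatingFunc::Softmax, GatingFunc::Sigmoid}) {
        const std::vector<float> w = expert_weights(selected_logits, g);
        for (float v : w) std::printf("%.4f ", v);
        std::printf("\n");
    }
    return 0;
}

In both branches the resulting weights sum to one per token; the softmax path needs no explicit division, which is why the diff only keeps the ggml_sum_rows/ggml_div normalization inside the sigmoid branch.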

src/llama-graph.h

Lines changed: 1 addition & 4 deletions
@@ -634,10 +634,7 @@ struct llm_graph_context {
             ggml_tensor * exp_probs_b,
             int64_t n_expert,
             int64_t n_expert_used,
-            llm_ffn_op_type type_op,
-            bool norm_w,
-            bool scale_w,
-            float w_scale,
+            llama_expert_gating_func_type gating_op,
             int il) const;
 
     //
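The declaration above is the only header change. To complement the gating sketch after the llama-graph.cpp diff, here is a scalar sketch (again plain C++, not ggml, with illustrative names) of the weighted expert combination that build_moe_ffn_from_probs performs once the weights are known, mirroring the view-then-add aggregation in the .cpp hunk:

// Each selected expert's output vector is scaled by its gating weight and the
// results are summed, which is what the per-expert views plus chained ggml_add
// nodes compute in the graph.
#include <cassert>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;

Vec combine_experts(const std::vector<Vec> & expert_out, const Vec & weights) {
    assert(!expert_out.empty() && expert_out.size() == weights.size());
    Vec moe_out(expert_out[0].size(), 0.0f);
    for (size_t e = 0; e < expert_out.size(); ++e) {       // one "view" per expert
        for (size_t d = 0; d < moe_out.size(); ++d) {
            moe_out[d] += weights[e] * expert_out[e][d];    // chained add in the graph
        }
    }
    return moe_out;
}

int main() {
    const std::vector<Vec> expert_out = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    const Vec weights = {0.25f, 0.75f};
    const Vec out = combine_experts(expert_out, weights);
    std::printf("%.2f %.2f\n", out[0], out[1]); // 2.50 3.50
    return 0;
}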

src/llama-model.cpp

Lines changed: 14 additions & 16 deletions
@@ -5191,7 +5191,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -17078,7 +17083,7 @@ struct llm_build_lfm2 : public llm_graph_context {
 };
 
 struct llm_build_smallthinker : public llm_graph_context{
-    llm_build_smallthinker(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params){
+    llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -17105,15 +17110,8 @@ struct llm_build_smallthinker : public llm_graph_context{
             bool is_moe = hparams.n_ff_exp == hparams.n_ff_arr[il];
 
             if (is_moe) {
-                ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
-                cb(logits, "ffn_moe_logits", il);
-
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
-                    probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens]
-                } else {
-                    probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
-                }
-                cb(probs, "ffn_moe_probs", il);
+                probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
+                cb(probs, "ffn_moe_logits", il);
             }
 
             // norm
@@ -17148,10 +17146,10 @@ struct llm_build_smallthinker : public llm_graph_context{
             cb(Kcur, "Kcur", il);
 
             if (hparams.is_swa_any()) {
-                cur = build_attn(static_cast<llm_graph_input_attn_kv_unified_iswa *>(inp_attn), gf, model.layers[il].wo, model.layers[il].bo, Qcur,Kcur, Vcur,
+                cur = build_attn(static_cast<llm_graph_input_attn_kv_unified_iswa *>(inp_attn), model.layers[il].wo, model.layers[il].bo, Qcur,Kcur, Vcur,
                                  nullptr,nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             } else {
-                cur = build_attn(static_cast<llm_graph_input_attn_kv_unified *>(inp_attn), gf, model.layers[il].wo, model.layers[il].bo, Qcur,Kcur, Vcur,
+                cur = build_attn(static_cast<llm_graph_input_attn_kv_unified *>(inp_attn), model.layers[il].wo, model.layers[il].bo, Qcur,Kcur, Vcur,
                                  nullptr,nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }
         }
@@ -17175,8 +17173,8 @@ struct llm_build_smallthinker : public llm_graph_context{
             if (is_moe) {
                 ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
                                                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
-                                                   nullptr, n_expert, n_expert_used, LLM_FFN_RELU, true, false, 0.0, il);
-
+                                                   nullptr, n_expert, n_expert_used,
+                                                   static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
             } else {
                 ffn_out = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
                                     model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_RELU, LLM_FFN_PAR, il);
@@ -17647,7 +17645,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_SMALLTHINKER:
             {
-                llm = std::make_unique<llm_build_smallthinker>(*this, params, gf);
+                llm = std::make_unique<llm_build_smallthinker>(*this, params);
             } break;
         default:
             GGML_ABORT("fatal error");
