Skip to content

Commit a2a2299

Browse files
committed
[fix] fix llm graph for ling mini 2.0
1 parent 94ec7dc commit a2a2299

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

src/llama-model.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1808,6 +1808,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
18081808
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
18091809
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
18101810
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1811+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1812+
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1813+
// Ling 2.0 uses the sigmoid gating function
1814+
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1815+
}
18111816

18121817
switch (hparams.n_layer) {
18131818
case 20: type = LLM_TYPE_16B; break;
@@ -16634,6 +16639,9 @@ struct llm_build_bailingmoe : public llm_graph_context {
1663416639

1663516640
struct llm_build_bailingmoe_v2 : public llm_graph_context {
1663616641
llm_build_bailingmoe_v2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16642+
const int64_t n_embd_head = hparams.n_embd_head_v;
16643+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
16644+
1663716645
ggml_tensor * cur;
1663816646
ggml_tensor * inpL;
1663916647

@@ -16682,9 +16690,9 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1668216690
cb(Vcur, "Vcur", il);
1668316691
}
1668416692

16685-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
16686-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
16687-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
16693+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
16694+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
16695+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1668816696

1668916697
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
1669016698
cb(Qcur, "Qcur_normed", il);
@@ -16710,7 +16718,7 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1671016718

1671116719
cur = build_attn(inp_attn,
1671216720
model.layers[il].wo, model.layers[il].bo,
16713-
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16721+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
1671416722
}
1671516723

1671616724
if (il == n_layer - 1 && inp_out_ids) {
@@ -16763,15 +16771,15 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1676316771
cur = ggml_add(ctx0, moe_out, ffn_shexp);
1676416772
cb(cur, "ffn_out", il);
1676516773
}
16774+
}
1676616775

16767-
cur = ggml_add(ctx0, cur, ffn_inp);
16776+
cur = ggml_add(ctx0, cur, ffn_inp);
1676816777

16769-
cur = build_cvec(cur, il);
16770-
cb(cur, "l_out", il);
16778+
cur = build_cvec(cur, il);
16779+
cb(cur, "l_out", il);
1677116780

16772-
// input for next layer
16773-
inpL = cur;
16774-
}
16781+
// input for next layer
16782+
inpL = cur;
1677516783
}
1677616784

1677716785
cur = inpL;

0 commit comments

Comments (0)