Skip to content

Commit a2a2299

Browse files
committed
[fix] fix llm graph for ling mini 2.0
1 parent 94ec7dc commit a2a2299

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

src/llama-model.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1808,6 +1808,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
18081808
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
18091809
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
18101810
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1811+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1812+
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1813+
// Ling 2.0 uses the sigmoid gating function
1814+
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1815+
}
18111816

18121817
switch (hparams.n_layer) {
18131818
case 20: type = LLM_TYPE_16B; break;
@@ -16634,6 +16639,9 @@ struct llm_build_bailingmoe : public llm_graph_context {
1663416639

1663516640
struct llm_build_bailingmoe_v2 : public llm_graph_context {
1663616641
llm_build_bailingmoe_v2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16642+
const int64_t n_embd_head = hparams.n_embd_head_v;
16643+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
16644+
1663716645
ggml_tensor * cur;
1663816646
ggml_tensor * inpL;
1663916647

@@ -16682,9 +16690,9 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1668216690
cb(Vcur, "Vcur", il);
1668316691
}
1668416692

16685-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
16686-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
16687-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
16693+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
16694+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
16695+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1668816696

1668916697
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
1669016698
cb(Qcur, "Qcur_normed", il);
@@ -16710,7 +16718,7 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1671016718

1671116719
cur = build_attn(inp_attn,
1671216720
model.layers[il].wo, model.layers[il].bo,
16713-
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16721+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
1671416722
}
1671516723

1671616724
if (il == n_layer - 1 && inp_out_ids) {
@@ -16763,15 +16771,15 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1676316771
cur = ggml_add(ctx0, moe_out, ffn_shexp);
1676416772
cb(cur, "ffn_out", il);
1676516773
}
16774+
}
1676616775

16767-
cur = ggml_add(ctx0, cur, ffn_inp);
16776+
cur = ggml_add(ctx0, cur, ffn_inp);
1676816777

16769-
cur = build_cvec(cur, il);
16770-
cb(cur, "l_out", il);
16778+
cur = build_cvec(cur, il);
16779+
cb(cur, "l_out", il);
1677116780

16772-
// input for next layer
16773-
inpL = cur;
16774-
}
16781+
// input for next layer
16782+
inpL = cur;
1677516783
}
1677616784

1677716785
cur = inpL;

0 commit comments

Comments (0)