@@ -1808,6 +1808,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
18081808 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
18091809 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
18101810 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1811+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1812+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1813+            // Ling 2.0 uses sigmoid gating func
1814+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1815+ }
18111816
18121817 switch (hparams.n_layer) {
18131818 case 20: type = LLM_TYPE_16B; break;
@@ -16634,6 +16639,9 @@ struct llm_build_bailingmoe : public llm_graph_context {
1663416639
1663516640struct llm_build_bailingmoe_v2 : public llm_graph_context {
1663616641 llm_build_bailingmoe_v2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16642+ const int64_t n_embd_head = hparams.n_embd_head_v;
16643+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
16644+
1663716645 ggml_tensor * cur;
1663816646 ggml_tensor * inpL;
1663916647
@@ -16682,9 +16690,9 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1668216690 cb(Vcur, "Vcur", il);
1668316691 }
1668416692
16685- Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot , n_head, n_tokens);
16686- Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot , n_head_kv, n_tokens);
16687- Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot , n_head_kv, n_tokens);
16693+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head , n_head, n_tokens);
16694+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head , n_head_kv, n_tokens);
16695+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head , n_head_kv, n_tokens);
1668816696
1668916697 Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
1669016698 cb(Qcur, "Qcur_normed", il);
@@ -16710,7 +16718,7 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1671016718
1671116719 cur = build_attn(inp_attn,
1671216720 model.layers[il].wo, model.layers[il].bo,
16713- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot )), il);
16721+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head )), il);
1671416722 }
1671516723
1671616724 if (il == n_layer - 1 && inp_out_ids) {
@@ -16763,15 +16771,15 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1676316771 cur = ggml_add(ctx0, moe_out, ffn_shexp);
1676416772 cb(cur, "ffn_out", il);
1676516773 }
16774+ }
1676616775
16767- cur = ggml_add(ctx0, cur, ffn_inp);
16776+ cur = ggml_add(ctx0, cur, ffn_inp);
1676816777
16769- cur = build_cvec(cur, il);
16770- cb(cur, "l_out", il);
16778+ cur = build_cvec(cur, il);
16779+ cb(cur, "l_out", il);
1677116780
16772- // input for next layer
16773- inpL = cur;
16774- }
16781+ // input for next layer
16782+ inpL = cur;
1677516783 }
1677616784
1677716785 cur = inpL;
0 commit comments