@@ -13580,6 +13580,15 @@ struct llm_build_glm4_moe : public llm_graph_context {
         const int64_t n_expert = hparams.n_expert;
         const int64_t n_expert_used = hparams.n_expert_used;

+        // Compute shared expert output first
+        ggml_tensor * cur_shexp = build_ffn(cur,
+                model.layers[il].ffn_up_shexp, NULL, NULL,
+                model.layers[il].ffn_gate_shexp, NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur_shexp, "ffn_shexp_out", il);
+
         ggml_tensor * moe_out =
             build_moe_ffn(cur,
                 model.layers[il].ffn_gate_inp,
@@ -13594,16 +13603,12 @@ struct llm_build_glm4_moe : public llm_graph_context {
                 il);
         cb(moe_out, "ffn_moe_out", il);

-        // Add shared expert computation
-        ggml_tensor * cur_shexp = build_ffn(cur,
-                model.layers[il].ffn_up_shexp, NULL, NULL,
-                model.layers[il].ffn_gate_shexp, NULL, NULL,
-                model.layers[il].ffn_down_shexp, NULL, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur_shexp, "ffn_shexp_out", il);
+        // For GLM4_MOE: Shared expert is always active alongside routed experts
+        // Apply proper scaling to shared expert to match architectural design
+        cur_shexp = ggml_scale(ctx0, cur_shexp, hparams.expert_weights_scale);
+        cb(cur_shexp, "ffn_shexp_scaled", il);

-        // Combine MoE output with shared expert output
+        // Combine with proper mathematical balance
         cur = ggml_add(ctx0, moe_out, cur_shexp);
         cb(cur, "ffn_out", il);
     }
0 commit comments