@@ -13580,36 +13580,34 @@ struct llm_build_glm4_moe : public llm_graph_context {
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
 
-    // Compute shared expert output first
-    ggml_tensor * cur_shexp = build_ffn(cur,
+    // Save original input for shared expert
+    ggml_tensor * residuals = cur;
+
+    // Process routed experts using existing MoE infrastructure
+    ggml_tensor * routed_out = build_moe_ffn(cur,
+            model.layers[il].ffn_gate_inp,
+            model.layers[il].ffn_up_exps,
+            model.layers[il].ffn_gate_exps,
+            model.layers[il].ffn_down_exps,
+            model.layers[il].ffn_exp_probs_b,
+            n_expert, n_expert_used,
+            LLM_FFN_SILU, hparams.expert_weights_norm,
+            true, hparams.expert_weights_scale,
+            (llama_expert_gating_func_type) hparams.expert_gating_func,
+            il);
+    cb(routed_out, "ffn_moe_out", il);
+
+    // Process shared expert on original input
+    ggml_tensor * shared_out = build_ffn(residuals,
             model.layers[il].ffn_up_shexp,   NULL, NULL,
             model.layers[il].ffn_gate_shexp, NULL, NULL,
             model.layers[il].ffn_down_shexp, NULL, NULL,
             NULL,
             LLM_FFN_SILU, LLM_FFN_PAR, il);
-    cb(cur_shexp, "ffn_shexp_out", il);
-
-    ggml_tensor * moe_out =
-        build_moe_ffn(cur,
-            model.layers[il].ffn_gate_inp,
-            model.layers[il].ffn_up_exps,
-            model.layers[il].ffn_gate_exps,
-            model.layers[il].ffn_down_exps,
-            model.layers[il].ffn_exp_probs_b,
-            n_expert, n_expert_used,
-            LLM_FFN_SILU, hparams.expert_weights_norm,
-            true, hparams.expert_weights_scale,
-            (llama_expert_gating_func_type) hparams.expert_gating_func,
-            il);
-    cb(moe_out, "ffn_moe_out", il);
-
-    // For GLM4_MOE: Shared expert is always active alongside routed experts
-    // Apply proper scaling to shared expert to match architectural design
-    cur_shexp = ggml_scale(ctx0, cur_shexp, hparams.expert_weights_scale);
-    cb(cur_shexp, "ffn_shexp_scaled", il);
+    cb(shared_out, "ffn_shexp_out", il);
 
-    // Combine with proper mathematical balance
-    cur = ggml_add(ctx0, moe_out, cur_shexp);
+    // Final output: routed_out + shared_out
+    cur = ggml_add(ctx0, routed_out, shared_out);
     cb(cur, "ffn_out", il);
 }
 
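Note on the net effect: the hunk reorders the two expert paths (the routed experts are now built first, and the shared expert runs on the saved original input), and it drops the ggml_scale of the shared-expert output by hparams.expert_weights_scale; the arguments passed to build_moe_ffn are unchanged. A toy scalar sketch of the old vs. new combination (hypothetical helper names; the real code operates on ggml tensors in the compute graph):

    // Sketch only: scalar stand-ins for the per-element tensor math.
    // routed = routed-experts output, shared = shared-expert output,
    // s = hparams.expert_weights_scale.
    float ffn_out_old(float routed, float shared, float s) {
        return routed + s * shared; // shared expert was pre-scaled
    }
    float ffn_out_new(float routed, float shared) {
        return routed + shared;     // shared expert added unscaled
    }

Since build_moe_ffn returns a new tensor and leaves its input node untouched, saving cur into residuals is primarily a readability choice; the shared expert would see the same tensor either way.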