@@ -13557,16 +13557,13 @@ struct llm_build_glm4_moe : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            // Post-attention norm
-            cur = build_norm(cur,
-                    model.layers[il].attn_post_norm,
-                    NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "post_attn_norm", il);
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

+            // Post-attention norm
+            cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
             // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
             if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
                 // Dense FFN layer
@@ -13582,9 +13579,6 @@ struct llm_build_glm4_moe : public llm_graph_context {
                 const int64_t n_expert      = hparams.n_expert;
                 const int64_t n_expert_used = hparams.n_expert_used;

-                // Save original input for shared expert
-                ggml_tensor * residuals = cur;
-
                 // Process routed experts using existing MoE infrastructure
                 ggml_tensor * routed_out = build_moe_ffn(cur,
                         model.layers[il].ffn_gate_inp,
@@ -13600,7 +13594,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
                 cb(routed_out, "ffn_moe_out", il);

                 // Process shared expert on original input
-                ggml_tensor * shared_out = build_ffn(residuals,
+                ggml_tensor * shared_out = build_ffn(cur,
                         model.layers[il].ffn_up_shexp, NULL, NULL,
                         model.layers[il].ffn_gate_shexp, NULL, NULL,
                         model.layers[il].ffn_down_shexp, NULL, NULL,
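Editor's note on the reordering (a hedged summary, not part of the commit): the first hunk moves the post-attention RMS norm so it runs on the residual sum ffn_inp rather than on the raw attention output, and the remaining hunks drop the residuals temporary, which is no longer needed because cur is not overwritten between the norm and the shared-expert call. A minimal scalar sketch of the two orderings; rms_norm, add, attn_out, and the ffn_input_* helpers are illustrative stand-ins invented for this note, not identifiers from the file, and rms_norm is a trivial identity placeholder so only the structure of the dataflow is compared:

    #include <cstdio>

    // Scalar stand-ins for the tensor ops above (illustration only):
    // rms_norm() plays the role of build_norm(..., LLM_NORM_RMS, ...)
    // and add() the role of ggml_add(ctx0, ...).
    static float rms_norm(float x)      { return x; }
    static float add(float a, float b)  { return a + b; }

    // Old order: norm the attention output, then add the residual; a
    // separate copy of the normed value ("residuals") fed the shared expert.
    static float ffn_input_old(float attn_out, float inpSA) {
        float cur = rms_norm(attn_out);   // post_attn_norm before the residual add
        return add(cur, inpSA);           // ffn_inp
    }

    // New order: add the residual first, then norm once; both the routed
    // experts and the shared expert read the same normed tensor afterwards,
    // so no extra copy is kept.
    static float ffn_input_new(float attn_out, float inpSA) {
        float ffn_inp = add(attn_out, inpSA); // residual add on the raw attention output
        return rms_norm(ffn_inp);             // post_attn_norm on the residual sum
    }

    int main() {
        printf("old=%f new=%f\n", ffn_input_old(1.0f, 2.0f), ffn_input_new(1.0f, 2.0f));
        return 0;
    }

Because every FFN branch now consumes the same normed tensor, the last hunk can pass cur directly to build_ffn instead of the removed residuals copy.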