Commit 6f3d94e

feat: support GLM 4.5 family of models
1 parent ab3183e commit 6f3d94e

File tree

1 file changed: src/llama-model.cpp

Lines changed: 22 additions & 24 deletions
@@ -13580,36 +13580,34 @@ struct llm_build_glm4_moe : public llm_graph_context {
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;

-    // Compute shared expert output first
-    ggml_tensor * cur_shexp = build_ffn(cur,
+    // Save original input for shared expert
+    ggml_tensor * residuals = cur;
+
+    // Process routed experts using existing MoE infrastructure
+    ggml_tensor * routed_out = build_moe_ffn(cur,
+            model.layers[il].ffn_gate_inp,
+            model.layers[il].ffn_up_exps,
+            model.layers[il].ffn_gate_exps,
+            model.layers[il].ffn_down_exps,
+            model.layers[il].ffn_exp_probs_b,
+            n_expert, n_expert_used,
+            LLM_FFN_SILU, hparams.expert_weights_norm,
+            true, hparams.expert_weights_scale,
+            (llama_expert_gating_func_type) hparams.expert_gating_func,
+            il);
+    cb(routed_out, "ffn_moe_out", il);
+
+    // Process shared expert on original input
+    ggml_tensor * shared_out = build_ffn(residuals,
             model.layers[il].ffn_up_shexp, NULL, NULL,
             model.layers[il].ffn_gate_shexp, NULL, NULL,
             model.layers[il].ffn_down_shexp, NULL, NULL,
             NULL,
             LLM_FFN_SILU, LLM_FFN_PAR, il);
-    cb(cur_shexp, "ffn_shexp_out", il);
-
-    ggml_tensor * moe_out =
-        build_moe_ffn(cur,
-            model.layers[il].ffn_gate_inp,
-            model.layers[il].ffn_up_exps,
-            model.layers[il].ffn_gate_exps,
-            model.layers[il].ffn_down_exps,
-            model.layers[il].ffn_exp_probs_b,
-            n_expert, n_expert_used,
-            LLM_FFN_SILU, hparams.expert_weights_norm,
-            true, hparams.expert_weights_scale,
-            (llama_expert_gating_func_type) hparams.expert_gating_func,
-            il);
-    cb(moe_out, "ffn_moe_out", il);
-
-    // For GLM4_MOE: Shared expert is always active alongside routed experts
-    // Apply proper scaling to shared expert to match architectural design
-    cur_shexp = ggml_scale(ctx0, cur_shexp, hparams.expert_weights_scale);
-    cb(cur_shexp, "ffn_shexp_scaled", il);
+    cb(shared_out, "ffn_shexp_out", il);

-    // Combine with proper mathematical balance
-    cur = ggml_add(ctx0, moe_out, cur_shexp);
+    // Final output: routed_output + shared_output
+    cur = ggml_add(ctx0, routed_out, shared_out);
     cb(cur, "ffn_out", il);
 }
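Net effect of this hunk: the GLM4_MOE FFN block now feeds the same layer input to both the routed-expert path (build_moe_ffn) and the always-active shared expert (build_ffn), and simply sums the two branch outputs; the previous extra ggml_scale of the shared-expert output by hparams.expert_weights_scale is dropped, so any weighting or scaling of experts happens only inside the routed path. A minimal standalone sketch of that composition follows; Tensor and the two helper functions are hypothetical stand-ins for the ggml graph builders, not the actual llama.cpp API.

#include <cstddef>
#include <vector>

// Hypothetical stand-ins for ggml tensors and for the build_moe_ffn /
// build_ffn graph builders shown in the diff above; placeholders only.
using Tensor = std::vector<float>;

Tensor routed_experts_ffn(const Tensor & x) { return x; } // stand-in: top-k routed experts, gate-weighted
Tensor shared_expert_ffn (const Tensor & x) { return x; } // stand-in: gated-SiLU shared expert, always active

// GLM4_MOE FFN block after this commit: both branches consume the same
// layer input and their outputs are summed, with no extra scaling applied
// to the shared-expert branch.
Tensor glm4_moe_ffn(const Tensor & x) {
    const Tensor routed_out = routed_experts_ffn(x); // build_moe_ffn(cur, ...)
    const Tensor shared_out = shared_expert_ffn(x);  // build_ffn(residuals, ...)

    Tensor out(x.size());
    for (std::size_t i = 0; i < out.size(); ++i) {
        out[i] = routed_out[i] + shared_out[i];      // ggml_add(ctx0, routed_out, shared_out)
    }
    return out;
}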
