@@ -4459,6 +4459,59 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_GLM4_MOE:
+                {
+                    const auto tn = LLM_TN(arch);
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // self-attention
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        // optional QK norms
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+                        // pre-FFN norm
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (static_cast<uint32_t>(i) < hparams.n_layer_dense_lead) {
+                            // this layer uses a dense FFN block
+                            const int64_t n_ff_dense = hparams.n_ff(i);
+
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_dense}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_dense, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_dense}, 0);
+                        } else {
+                            // this layer uses a MoE FFN block (1 group of conditional experts + 1 shared expert)
+                            const int64_t n_ff_exp = hparams.n_ff_exp;
+
+                            // router input and expert biases
+                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias",   i), {n_expert}, 0);
+
+                            // conditional experts
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                            // shared expert
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp}, 0);
+                        }
+                    }
+                } break;
             case LLM_ARCH_NEMOTRON:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -13599,7 +13652,7 @@ struct llm_build_glm4 : public llm_graph_context {
 struct llm_build_glm4_moe : public llm_graph_context {
     llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         // TODO
-    };
+    }
 };
 
 struct llm_build_nemotron : public llm_graph_context {
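
For context, the MoE branch above loads a routed FFN: `ffn_gate_inp` is the router, its bias steers expert selection, the `*_exps` tensors hold the `n_expert` conditional experts, and the `*_shexp` tensors hold the single always-active shared expert; the graph construction itself is still a TODO in this commit. Below is a minimal, self-contained sketch of the per-token routing math in plain C++. All names are hypothetical, there is no ggml, and DeepSeek-style sigmoid gating with the bias applied only to selection is an assumption, not a statement of what the finished build function will do.

```cpp
// Sketch: one token through "1 group of conditional experts + 1 shared expert".
// Hypothetical names; gating details are assumptions, not the llama.cpp implementation.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

struct ExpertFFN {
    // stand-in for an expert's gate/up/down projections; a real expert is a full SwiGLU FFN
    float scale = 1.0f;
    std::vector<float> forward(const std::vector<float> & x) const {
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = scale * x[i];
        }
        return y;
    }
};

// x           : token embedding of size n_embd
// router_w    : [n_expert][n_embd] router weights (the ffn_gate_inp tensor)
// router_bias : [n_expert] per-expert selection bias (the expert probs bias tensor)
std::vector<float> moe_ffn(const std::vector<float>              & x,
                           const std::vector<std::vector<float>> & router_w,
                           const std::vector<float>              & router_bias,
                           const std::vector<ExpertFFN>          & experts,
                           const ExpertFFN                       & shared_expert,
                           int                                     n_expert_used) {
    const int n_expert = (int) experts.size();

    // 1. router scores: sigmoid of the router logits; the bias is added for ranking only
    std::vector<float> prob(n_expert), rank(n_expert);
    for (int e = 0; e < n_expert; ++e) {
        const float logit = std::inner_product(x.begin(), x.end(), router_w[e].begin(), 0.0f);
        prob[e] = 1.0f / (1.0f + std::exp(-logit));
        rank[e] = prob[e] + router_bias[e];
    }

    // 2. select the top n_expert_used experts by biased score
    std::vector<int> order(n_expert);
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + n_expert_used, order.end(),
                      [&](int a, int b) { return rank[a] > rank[b]; });

    // 3. weighted sum of the selected experts, weights renormalized to sum to 1
    float wsum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) {
        wsum += prob[order[k]];
    }
    std::vector<float> out(x.size(), 0.0f);
    for (int k = 0; k < n_expert_used; ++k) {
        const std::vector<float> y = experts[order[k]].forward(x);
        const float w = prob[order[k]] / wsum;
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] += w * y[i];
        }
    }

    // 4. the shared expert is applied to every token and added on top
    const std::vector<float> ys = shared_expert.forward(x);
    for (size_t i = 0; i < x.size(); ++i) {
        out[i] += ys[i];
    }

    return out;
}
```

A finished `llm_build_glm4_moe` would express these steps as ggml operations over the tensors created above, batched over all tokens in the ubatch, rather than per-token loops.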