@@ -1208,15 +1208,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_GLM4:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_9B; break;
-                    case 61: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_BITNET:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3503,45 +3494,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
 
-                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                }
-            } break;
-        case LLM_ARCH_GLM4:
-            {
-                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                // output
-                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                // if output is NULL, init from the input tok embed
-                if (output == NULL) {
-                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                }
-
-                for (int i = 0; i < n_layer; ++i) {
-                    auto & layer = layers[i];
-
-                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-
-                    if (layer.wqkv == nullptr) {
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                    }
-
-                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
-
                     layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
@@ -10977,157 +10929,6 @@ struct llm_build_chatglm : public llm_graph_context {
     }
 };
 
-struct llm_build_glm4 : public llm_graph_context {
-    llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        auto * inp_attn = build_attn_inp_kv_unified();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            // Pre-attention norm
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm,
-                    NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                ggml_tensor * Qcur = nullptr;
-                ggml_tensor * Kcur = nullptr;
-                ggml_tensor * Vcur = nullptr;
-
-                if (model.layers[il].wqkv == nullptr) {
-                    Qcur = build_lora_mm(model.layers[il].wq, cur);
-                    if (model.layers[il].bq) {
-                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    }
-                    Kcur = build_lora_mm(model.layers[il].wk, cur);
-                    if (model.layers[il].bk) {
-                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    }
-                    Vcur = build_lora_mm(model.layers[il].wv, cur);
-                    if (model.layers[il].bv) {
-                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    }
-                } else {
-                    cur = build_lora_mm(model.layers[il].wqkv, cur);
-                    cb(cur, "wqkv", il);
-                    if (model.layers[il].bqkv) {
-                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                        cb(cur, "bqkv", il);
-                    }
-                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-                }
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-                Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                cur = build_attn(inp_attn, gf,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // Post-attention norm (new!)
-            cur = build_norm(cur,
-                    model.layers[il].attn_post_norm,
-                    NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "post_attn_norm", il);
-
-            // Add the input (residual connection after post-attention norm)
-            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                // Pre-MLP norm
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm,
-                        NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                // MLP
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up, NULL, NULL,
-                        NULL, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-                cb(cur, "ffn_out", il);
-
-                // Post-MLP norm
-                cur = build_norm(cur,
-                        model.layers[il].ffn_post_norm,
-                        NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "post_mlp_norm", il);
-            }
-
-            // Add residual connection after post-MLP norm
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        // Final norm
-        cur = build_norm(inpL,
-                model.output_norm,
-                NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // Output projection
-        cur = build_lora_mm(model.output, cur);
-
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13009,10 +12810,6 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
             } break;
-        case LLM_ARCH_GLM4:
-            {
-                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
-            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -13210,7 +13007,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON: