
Commit 1dd934b

remove glm4 arch
1 parent b928f8c commit 1dd934b

File tree: 6 files changed, 2 additions and 249 deletions


convert_hf_to_gguf.py

Lines changed: 2 additions & 4 deletions
@@ -669,7 +669,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516" or chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -735,9 +736,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
 
         if res is None:
             logger.warning("\n")

gguf-py/gguf/constants.py

Lines changed: 0 additions & 19 deletions
@@ -282,7 +282,6 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK = auto()
     DEEPSEEK2 = auto()
     CHATGLM = auto()
-    GLM4 = auto()
     BITNET = auto()
     T5 = auto()
     T5ENCODER = auto()
@@ -492,7 +491,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK: "deepseek",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
     MODEL_ARCH.CHATGLM: "chatglm",
-    MODEL_ARCH.GLM4: "glm4",
     MODEL_ARCH.BITNET: "bitnet",
     MODEL_ARCH.T5: "t5",
     MODEL_ARCH.T5ENCODER: "t5encoder",
@@ -1573,23 +1571,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_POST_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
-    MODEL_ARCH.GLM4 : [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_POST_NORM,
-        MODEL_TENSOR.FFN_POST_NORM,
-    ],
     MODEL_ARCH.BITNET: [
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,

src/llama-arch.cpp

Lines changed: 0 additions & 20 deletions
@@ -54,7 +54,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
-    { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -1159,25 +1158,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
-    {
-        LLM_ARCH_GLM4,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
-            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
-        },
-    },
     {
         LLM_ARCH_BITNET,
         {

src/llama-arch.h

Lines changed: 0 additions & 1 deletion
@@ -58,7 +58,6 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
-    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,

src/llama-model.cpp

Lines changed: 0 additions & 204 deletions
@@ -1208,15 +1208,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_GLM4:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_9B; break;
-                    case 61: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_BITNET:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3503,45 +3494,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
 
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GLM4:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-
-                        if (layer.wqkv == nullptr) {
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                        }
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
-
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
@@ -10977,157 +10929,6 @@ struct llm_build_chatglm : public llm_graph_context {
     }
 };
 
-struct llm_build_glm4 : public llm_graph_context {
-    llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        auto * inp_attn = build_attn_inp_kv_unified();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            // Pre-attention norm
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm,
-                    NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                ggml_tensor * Qcur = nullptr;
-                ggml_tensor * Kcur = nullptr;
-                ggml_tensor * Vcur = nullptr;
-
-                if (model.layers[il].wqkv == nullptr) {
-                    Qcur = build_lora_mm(model.layers[il].wq, cur);
-                    if (model.layers[il].bq) {
-                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    }
-                    Kcur = build_lora_mm(model.layers[il].wk, cur);
-                    if (model.layers[il].bk) {
-                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    }
-                    Vcur = build_lora_mm(model.layers[il].wv, cur);
-                    if (model.layers[il].bv) {
-                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    }
-                } else {
-                    cur = build_lora_mm(model.layers[il].wqkv, cur);
-                    cb(cur, "wqkv", il);
-                    if (model.layers[il].bqkv) {
-                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                        cb(cur, "bqkv", il);
-                    }
-                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-                }
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-                Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                cur = build_attn(inp_attn, gf,
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // Post-attention norm (new!)
-            cur = build_norm(cur,
-                    model.layers[il].attn_post_norm,
-                    NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "post_attn_norm", il);
-
-            // Add the input (residual connection after post-attention norm)
-            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                // Pre-MLP norm
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm,
-                        NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                // MLP
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up, NULL, NULL,
-                        NULL, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-                cb(cur, "ffn_out", il);
-
-                // Post-MLP norm
-                cur = build_norm(cur,
-                        model.layers[il].ffn_post_norm,
-                        NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "post_mlp_norm", il);
-            }
-
-            // Add residual connection after post-MLP norm
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        // Final norm
-        cur = build_norm(inpL,
-                model.output_norm,
-                NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // Output projection
-        cur = build_lora_mm(model.output, cur);
-
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13009,10 +12810,6 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
             } break;
-        case LLM_ARCH_GLM4:
-            {
-                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
-            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -13210,7 +13007,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:

src/llama-vocab.cpp

Lines changed: 0 additions & 1 deletion
@@ -1572,7 +1572,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
             clean_spaces = false;
         } else if (
-                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
             special_bos_id = LLAMA_TOKEN_NULL;

0 commit comments