Commit 47c3dc7

Add support for GLM-4.5 models (ikawrakow#668)
* GLM-4.5
* GLM-4.5
* GLM-4.5
* convert_hf_to_gguf.py compatibility bugfix with GLM-4.5
  From @ubergarm - ikawrakow#668 (comment)
* Add ubergarm comments + my own
* Revert to llama.cpp script version that produced good BF16
  See: ikawrakow#668 (comment)
* Support for jinja chat templates
  See ikawrakow#668 (comment)
* GLM-4.5 llama.cpp final port
* Handle TENSOR_SKIP
  Ported the changes from:
  sammcj/llama.cpp@f129567
  sammcj/llama.cpp@dcbbd2c
  Except op info since ik_llama.cpp doesn't support this operation.
* Bugfix for TENSOR_SKIP
  Skip loading if a tensor has the TENSOR_SKIP flag - @ubergarm via ikawrakow#668 (comment)
* Update llama.cpp
  Restore original GGML_ASSERT
* Fix chat template detection
  Changes suggested by @ubergarm - ikawrakow#668 (comment)
* Revert to original GGML_ASSERT
1 parent ae0ba31 commit 47c3dc7

File tree

9 files changed, +1288 -26 lines


convert_hf_to_gguf.py

Lines changed: 140 additions & 0 deletions
@@ -618,6 +618,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
+            # ref: https://huggingface.co/zai-org/GLM-4.5-Air, https://huggingface.co/zai-org/GLM-4.5
+            res = "glm4"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -3948,6 +3957,137 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
             return [(self.map_tensor_name(name), data_torch)]
         return super().modify_tensors(data_torch, name, bid)
 
+@Model.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])          # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])         # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])    # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        # Patch broken chat template
+        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
+            special_vocab.chat_template = special_vocab.chat_template.replace(
+                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
+                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
+
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding (but not layer-specific NextN embeddings)
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
 
 @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
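
With Glm4MoeModel registered, converting a GLM-4.5 checkpoint should follow the usual flow, along the lines of: python convert_hf_to_gguf.py /path/to/GLM-4.5-Air --outtype bf16 --outfile GLM-4.5-Air-BF16.gguf. Paths and the output name here are illustrative; ikawrakow#668 discusses the script revision that produced a known-good BF16.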

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2", },
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902", },
     {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", },
 ]
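
These entries are what convert_hf_to_gguf_update.py uses to fetch each repo's tokenizer and regenerate the chkhsh checks in convert_hf_to_gguf.py; it is normally invoked with a Hugging Face token, roughly python convert_hf_to_gguf_update.py <hf_token>, and gated repos need an account with access to them.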

gguf-py/gguf/constants.py

Lines changed: 62 additions & 0 deletions
@@ -91,6 +91,7 @@ class LLM:
         EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
         EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
+        NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

@@ -159,6 +160,13 @@ class Tokenizer:
         CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+        # FIM/Infill special tokens constants
         PREFIX_ID = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"

@@ -220,6 +228,7 @@ class MODEL_ARCH(IntEnum):
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
+    GLM4_MOE = auto()
     CHATGLM = auto()
     BITNET = auto()
     BITNET_25 = auto()

@@ -314,6 +323,12 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN = auto()
     ENC_FFN_UP = auto()
     ENC_OUTPUT_NORM = auto()
+    NEXTN_EH_PROJ = auto()  # nextn tensors (glm4moe)
+    NEXTN_EMBED_TOKENS = auto()  # nextn tensors (glm4moe)
+    NEXTN_ENORM = auto()  # nextn tensors (glm4moe)
+    NEXTN_HNORM = auto()  # nextn tensors (glm4moe)
+    NEXTN_SHARED_HEAD_HEAD = auto()  # nextn tensors (glm4moe)
+    NEXTN_SHARED_HEAD_NORM = auto()  # nextn tensors (glm4moe)
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -358,6 +373,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
     MODEL_ARCH.CHATGLM: "chatglm",
+    MODEL_ARCH.GLM4_MOE: "glm4moe",
     MODEL_ARCH.BITNET: "bitnet",
     MODEL_ARCH.BITNET_25: "bitnet-25",
     MODEL_ARCH.T5: "t5",

@@ -451,6 +467,13 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
+    # NextN/MTP
+    MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
+    MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
+    MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm",
+    MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -1070,6 +1093,37 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GLM4_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
     MODEL_ARCH.BITNET: [
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,

@@ -1633,6 +1687,14 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
 KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
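
Once a GLM-4.5 GGUF has been written, the new glm4moe.nextn_predict_layers key can be checked with gguf-py's reader. A minimal sketch, assuming a converted file at an illustrative path and using the parts/data layout that ReaderField exposes for scalar values:

    from gguf import GGUFReader

    reader = GGUFReader("GLM-4.5-Air-BF16.gguf")  # illustrative path
    field = reader.get_field("glm4moe.nextn_predict_layers")
    if field is not None:
        # for a scalar field, the indexed part holds the single uint32 value
        print(int(field.parts[field.data[0]][0]))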

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -677,6 +677,9 @@ def add_expert_weights_norm(self, value: bool) -> None:
     def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
         self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
 
+    def add_nextn_predict_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
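
The writer side can be exercised in isolation; a small sketch that round-trips only the new key into a throwaway file. The file name is illustrative, the "glm4moe" arch string matches the MODEL_ARCH_NAMES entry added above, and the value 1 mirrors the "num_hidden_layers + 1" comment in the convert class:

    from gguf import GGUFWriter

    writer = GGUFWriter("nextn-demo.gguf", arch="glm4moe")  # throwaway output
    writer.add_nextn_predict_layers(1)
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()  # no tensors were added; this just finalizes the file
    writer.close()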

gguf-py/gguf/tensor_mapping.py

Lines changed: 25 additions & 0 deletions
@@ -592,6 +592,31 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm",  # t5
         ),
+
+        # NextN/MTP tensors for GLM4_MOE
+        MODEL_TENSOR.NEXTN_EH_PROJ: (
+            "model.layers.{bid}.eh_proj",
+        ),
+
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
+            "model.layers.{bid}.embed_tokens",
+        ),
+
+        MODEL_TENSOR.NEXTN_ENORM: (
+            "model.layers.{bid}.enorm",
+        ),
+
+        MODEL_TENSOR.NEXTN_HNORM: (
+            "model.layers.{bid}.hnorm",
+        ),
+
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
+            "model.layers.{bid}.shared_head.head",
+        ),
+
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
+            "model.layers.{bid}.shared_head.norm",
+        ),
     }
 
     # architecture-specific block mappings
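
A quick way to sanity-check these mappings is to resolve a NextN tensor name the same way the convert script does. A sketch, assuming GLM-4.5-Air's 46 hidden layers plus one NextN layer (so block count 47, with the NextN tensors sitting in layer 46):

    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.GLM4_MOE, 47)
    name = tmap.get_name("model.layers.46.eh_proj.weight", try_suffixes=(".weight", ".bias"))
    print(name)  # expected: blk.46.nextn.eh_proj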
