
Commit c2d602b

Merge pull request #11 from Thireus/glm-4.5
Support for Glm 4.5
2 parents bb4c917 + fe552b9 commit c2d602b

File tree

4 files changed · +1045 -12 lines changed


convert_hf_to_gguf.py

Lines changed: 214 additions & 0 deletions
@@ -618,6 +618,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
+            # ref: https://huggingface.co/zai-org/GLM-4.5-Air, https://huggingface.co/zai-org/GLM-4.5
+            res = "gpt-2"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
            # ref: https://huggingface.co/LumiOpen/Viking-7B
            res = "viking"
@@ -3948,6 +3954,214 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         return [(self.map_tensor_name(name), data_torch)]
         return super().modify_tensors(data_torch, name, bid)
 
+@Model.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.dir_model, trust_remote_code=True
+        )
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Set special tokens
+        special_vocab._set_special_token(
+            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token(
+            "unk", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+        special_vocab._set_special_token(
+            "bos", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
+
+        # MoE parameters
+        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        # Note: expert_used_count is already set by parent class using num_experts_per_tok
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _shared_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding (but not layer-specific NextN embeddings)
+        if name == "model.embed_tokens.weight":
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            # Extend experts array if needed (for models where actual layers > num_hidden_layers)
+            while len(self._experts) <= bid:
+                self._experts.append({})
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    # Generate GGUF tensor names for merged experts
+                    if w_name == "down_proj":
+                        new_name = f"blk.{bid}.ffn_down_exps.weight"
+                    elif w_name == "gate_proj":
+                        new_name = f"blk.{bid}.ffn_gate_exps.weight"
+                    elif w_name == "up_proj":
+                        new_name = f"blk.{bid}.ffn_up_exps.weight"
+                    else:
+                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                        new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        # Handle expert gating input (routing gate)
+        if ".mlp.gate.e_score_correction_bias" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias"  # *NOTE* this is ".exp_probs_b" in mainline PR
+            )
+            return [(new_name, data_torch)]
+        elif ".mlp.gate.weight" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.weight", ".ffn_gate_inp.weight"
+            )
+            return [(new_name, data_torch)]
+
+        # Handle shared expert tensors
+        if ".mlp.shared_experts." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
+            if "gate_proj" in new_name:
+                new_name = new_name.replace("gate_proj", "gate_shexp")
+            elif "down_proj" in new_name:
+                new_name = new_name.replace("down_proj", "down_shexp")
+            elif "up_proj" in new_name:
+                new_name = new_name.replace("up_proj", "up_shexp")
+            return [(new_name, data_torch)]
+
+        # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
+        if ".mlp." in name and "experts" not in name and "_shexp" not in name:
+            if "gate_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.gate_proj.weight", ".ffn_gate.weight"
+                )
+            elif "up_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.up_proj.weight", ".ffn_up.weight"
+                )
+            elif "down_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.down_proj.weight", ".ffn_down.weight"
+                )
+            else:
+                new_name = name
+            return [(self.map_tensor_name(new_name), data_torch)]
+
+        # Handle special NextN tensors - preserve for future MTP support - See https://github.com/ggml-org/llama.cpp/pull/13236
+        if (
+            ".embed_tokens." in name
+            or ".shared_head." in name
+            or ".eh_proj." in name
+            or ".enorm." in name
+            or ".hnorm." in name
+        ):
+            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
+            # logger.debug(f"Skipping MTP tensor: {new_name}")
+            return [(new_name, data_torch)]
+
+        # GLM tensor mapping - handle directly without map_tensor_name
+        if ".input_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
+            return [(new_name, data_torch)]
+        elif ".post_attention_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
+            return [(new_name, data_torch)]
+        elif ".self_attn." in name:
+            # Map GLM self_attn to standard attention naming
+            new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
+            if "q_proj" in new_name:
+                new_name = new_name.replace("q_proj", "q")
+            elif "k_proj" in new_name:
+                new_name = new_name.replace("k_proj", "k")
+            elif "v_proj" in new_name:
+                new_name = new_name.replace("v_proj", "v")
+            elif "o_proj" in new_name:
+                new_name = new_name.replace("o_proj", "output")
+            return [(new_name, data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
 
 @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
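The core of the routed-expert handling above is collecting each expert's 2-D projection weights for a block and stacking them into one 3-D tensor per projection type (ffn_down_exps, ffn_gate_exps, ffn_up_exps). A self-contained sketch of that layout follows; the dimensions are toy values for illustration, not GLM-4.5's real sizes:

```python
# Minimal sketch of the merged-experts layout produced by Glm4MoeModel.modify_tensors.
# All sizes below are illustrative placeholders, not GLM-4.5's actual dimensions.
import torch

n_experts, d_model, d_ff, bid = 4, 16, 8, 0
experts = {
    f"model.layers.{bid}.mlp.experts.{xid}.down_proj.weight": torch.randn(d_model, d_ff)
    for xid in range(n_experts)
}

# Stack the per-expert [d_model, d_ff] matrices along a new leading dimension so that a
# single merged tensor (named blk.{bid}.ffn_down_exps.weight in GGUF) holds all experts.
stacked = torch.stack(
    [experts[f"model.layers.{bid}.mlp.experts.{xid}.down_proj.weight"] for xid in range(n_experts)],
    dim=0,
)
assert stacked.shape == (n_experts, d_model, d_ff)
print(f"blk.{bid}.ffn_down_exps.weight", tuple(stacked.shape))
```

Buffering experts per block and only emitting once all n_experts * 3 projections have arrived is what lets prepare_tensors() verify at the end that no expert weights were left unprocessed.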

gguf-py/gguf/constants.py

Lines changed: 51 additions & 0 deletions
@@ -220,6 +220,7 @@ class MODEL_ARCH(IntEnum):
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
+    GLM4_MOE = auto()
     CHATGLM = auto()
     BITNET = auto()
     BITNET_25 = auto()
@@ -262,6 +263,9 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_EXP = auto()
     FFN_DOWN_EXP = auto()
     FFN_UP_EXP = auto()
+    FFN_GATE_EXPS = auto()  # merged experts
+    FFN_DOWN_EXPS = auto()  # merged experts
+    FFN_UP_EXPS = auto()  # merged experts
     FFN_GATE_SHEXP = auto()
     FFN_DOWN_SHEXP = auto()
     FFN_UP_SHEXP = auto()
@@ -314,6 +318,12 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN = auto()
     ENC_FFN_UP = auto()
     ENC_OUTPUT_NORM = auto()
+    NEXTN_EH_PROJ = auto()  # nextn tensors (glm4moe)
+    NEXTN_EMBED_TOKENS = auto()  # nextn tensors (glm4moe)
+    NEXTN_ENORM = auto()  # nextn tensors (glm4moe)
+    NEXTN_HNORM = auto()  # nextn tensors (glm4moe)
+    NEXTN_SHARED_HEAD_HEAD = auto()  # nextn tensors (glm4moe)
+    NEXTN_SHARED_HEAD_NORM = auto()  # nextn tensors (glm4moe)
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -358,6 +368,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
     MODEL_ARCH.CHATGLM: "chatglm",
+    MODEL_ARCH.GLM4_MOE: "glm4moe",
     MODEL_ARCH.BITNET: "bitnet",
     MODEL_ARCH.BITNET_25: "bitnet-25",
     MODEL_ARCH.T5: "t5",
@@ -404,6 +415,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.FFN_GATE_EXPS: "blk.{bid}.ffn_gate_exps",  # merged experts
+    MODEL_TENSOR.FFN_DOWN_EXPS: "blk.{bid}.ffn_down_exps",  # merged experts
+    MODEL_TENSOR.FFN_UP_EXPS: "blk.{bid}.ffn_up_exps",  # merged experts
     MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
@@ -451,6 +465,13 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
+    # NextN/MTP tensors (GLM4_MOE)
+    MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.eh_proj",
+    MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.embed_tokens",
+    MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.enorm",
+    MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.hnorm",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.shared_head.head",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.shared_head.norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
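These entries are plain per-block format strings; the NextN/MTP names land in the extra block that Glm4MoeModel reserves beyond num_hidden_layers (block_count = num_hidden_layers + 1). A small sketch of how the templates expand; the layer count here is an assumed example, not a value taken from this PR:

```python
# Hedged sketch: expanding the NextN/MTP name templates for the converter's extra block.
# num_hidden_layers is an assumed illustrative value; the NextN block sits at that index.
nextn_templates = [
    "blk.{bid}.eh_proj",
    "blk.{bid}.embed_tokens",
    "blk.{bid}.enorm",
    "blk.{bid}.hnorm",
    "blk.{bid}.shared_head.head",
    "blk.{bid}.shared_head.norm",
]
num_hidden_layers = 46          # placeholder value for illustration only
nextn_block = num_hidden_layers  # the (+1) block appended by Glm4MoeModel.__init__
for template in nextn_templates:
    print(template.format(bid=nextn_block))
```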
@@ -1070,6 +1091,36 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GLM4_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,  # dense layers
+        MODEL_TENSOR.FFN_DOWN,  # dense layers
+        MODEL_TENSOR.FFN_UP,  # dense layers
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXPS,
+        MODEL_TENSOR.FFN_DOWN_EXPS,
+        MODEL_TENSOR.FFN_UP_EXPS,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
     MODEL_ARCH.BITNET: [
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
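The MODEL_TENSORS entry declares which tensors the glm4moe architecture may carry; combined with the name templates above it enumerates the expected GGUF names per block. A rough usage sketch, assuming this patch is applied and the bundled gguf-py package is importable:

```python
# Sketch only: requires this commit's gguf-py (with MODEL_ARCH.GLM4_MOE) on the path.
from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

bid = 0  # any block index; per-block templates contain a {bid} placeholder
for tensor in MODEL_TENSORS[MODEL_ARCH.GLM4_MOE]:
    # Entries without {bid} (e.g. TOKEN_EMBD) are unaffected by the format call.
    print(TENSOR_NAMES[tensor].format(bid=bid))
```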
