@@ -678,6 +678,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
+            # ref: https://huggingface.co/zai-org/GLM-4.5-Air
+            res = "glm4"
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
@@ -6578,6 +6581,149 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eos and eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - end of
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - same as EOS
+        special_vocab._set_special_token("eog", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - same as EOS
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        if "<sop>" in tokenizer.get_added_vocab():
+            special_vocab._set_special_token("sop", tokenizer.get_added_vocab()["<sop>"])  # 151333
+        if "<eop>" in tokenizer.get_added_vocab():
+            special_vocab._set_special_token("eop", tokenizer.get_added_vocab()["<eop>"])  # 151334
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
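+        # partial_rotary_factor defaults to 0.5 above, i.e. RoPE is applied to half of each head dimension when the hparam is absent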
+
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+        if (num_experts_per_tok := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(num_experts_per_tok)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding (but not layer-specific NextN embeddings)
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
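+                # each expert contributes 3 tensors (gate_proj, up_proj, down_proj),
+                # so the layer is complete once n_experts * 3 tensors have been collected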
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # Handle special NextN tensors - preserve for future MTP support
+        if (
+            ".embed_tokens." in name
+            or ".shared_head." in name
+            or ".eh_proj." in name
+            or ".enorm." in name
+            or ".hnorm." in name
+        ):
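+            # strip the "model."/"model.layers." prefixes and the ".weight" suffix,
+            # e.g. "model.layers.<N>.eh_proj.weight" -> "blk.<N>.eh_proj" (illustrative name)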
+            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
+            return [(new_name, data_torch)]
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CHATGLM
@@ -6594,7 +6740,7 @@ def set_vocab_chatglm3(self):
         vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
         assert max(tokenizer.get_vocab().values()) < vocab_size
         role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
-        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        special_tokens = ["[MASK]", "[gMASK]", "sop", "eop"] + role_special_tokens
         for token_id in range(vocab_size):
             piece = tokenizer._convert_id_to_token(token_id)
             if token_id == 0: