@@ -678,6 +678,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
            res = "glm4"
+        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
+            # ref: https://huggingface.co/zai-org/GLM-4.5-Air
+            res = "glm4"
        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
            res = "minerva-7b"
@@ -6696,6 +6699,139 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        # Patch broken chat template
+        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
+            special_vocab.chat_template = special_vocab.chat_template.replace(
+                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
+                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
+
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding (but not layer-specific NextN embeddings)
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CHATGLM