@@ -735,6 +735,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
 
         if res is None:
             logger.warning("\n")
@@ -1750,7 +1753,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 
                 low_freq_wavelen = old_context_len / low_freq_factor
                 high_freq_wavelen = old_context_len / high_freq_factor
-                assert low_freq_wavelen != high_freq_wavelen
+                # assert low_freq_wavelen != high_freq_wavelen  # Errors for Llama4
 
                 rope_factors = []
                 for freq in freqs:
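The assertion fires exactly when a rope_scaling config uses the same low- and high-frequency factor, which makes the two wavelengths coincide; the comment suggests Llama 4 ships such a config, so the check is disabled rather than aborting the conversion. A tiny illustration with made-up numbers:

# Hypothetical rope_scaling values, only to show when the old assert would trip.
old_context_len  = 8192
low_freq_factor  = 4.0
high_freq_factor = 4.0   # equal factors ...

low_freq_wavelen  = old_context_len / low_freq_factor    # 2048.0
high_freq_wavelen = old_context_len / high_freq_factor   # 2048.0
# ... give identical wavelengths, so `assert low_freq_wavelen != high_freq_wavelen` raises.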
@@ -1806,10 +1809,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        name = name.replace("language_model.", "")
-        name = name.replace("feed_forward.", "mlp.")  # a bit hacky for now
-        name = name.replace(".router.weight", ".gate.weight")  # a bit hacky for now
-
         # split the gate_up into gate and up
         if "gate_up_proj" in name:
             name_up = name.replace("gate_up_proj", "up_proj.weight")
@@ -4423,6 +4422,10 @@ def set_vocab(self):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
+
+        # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
+        self.hparams["num_key_value_heads"] = 1
+
         super().set_gguf_parameters()
         hparams = self.hparams
 
@@ -4431,8 +4434,13 @@ def set_gguf_parameters(self):
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
         self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+
+        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
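Together with the num_key_value_heads = 1 override above, these writer calls describe the cache as a single shared latent KV "head" (MQA) plus separate per-head sizes for the decompressed MLA attention. A worked example of the arithmetic, using illustrative DeepSeek-V2-style head dimensions (assumed here, not read from a real config):

hparams = {  # illustrative values
    "kv_lora_rank": 512,
    "qk_rope_head_dim": 64,
    "qk_nope_head_dim": 128,
    "v_head_dim": 128,
}

# Cache-side (compressed, MQA) sizes: the one latent head the KV cache stores.
key_length   = hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]  # 512 + 64 = 576
value_length = hparams["kv_lora_rank"]                                # 512

# Decompressed (per-attention-head, MHA) sizes used after the MLA up-projection.
key_length_mla   = hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]  # 128 + 64 = 192
value_length_mla = hparams["v_head_dim"]                                      # 128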
@@ -4501,6 +4509,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             else:
                 return []
 
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
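A standalone sketch of the split/transpose above with dummy tensors, to make the resulting shapes concrete (the head count and dimensions below are made up, loosely DeepSeek-like):

import torch

n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 4, 128, 128, 512

# kv_b_proj maps the compressed latent (kv_lora_rank) to per-head nope-K and V.
kv_b_proj = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b = kv_b_proj.view(n_head_kv, qk_nope_head_dim + v_head_dim, kv_lora_rank)
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)  # transposed so the absorbed form can be applied to the cached latent

print(k_b.shape)  # torch.Size([4, 512, 128])  -> (head, kv_lora_rank, qk_nope_head_dim)
print(v_b.shape)  # torch.Size([4, 128, 512])  -> (head, v_head_dim, kv_lora_rank)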
@@ -4901,6 +4929,22 @@ def prepare_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
+@Model.register("Glm4ForCausalLM")
+class Glm4Model(Model):
+    model_arch = gguf.MODEL_ARCH.GLM4
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
 @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM
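For reference, a rope_scaling block of the shape the YARN branch above expects from config.json (values are illustrative, not taken from an actual GLM-4 config):

rope_scaling = {  # illustrative only
    "type": "yarn",
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
}
# With such an entry, Glm4Model.set_gguf_parameters() would record the YARN
# scaling type, a scaling factor of 4.0, and an original context length of
# 32768 in the GGUF metadata; without it, only the base parameters are written.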
@@ -5592,7 +5636,6 @@ def main() -> None:
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
         model_architecture = hparams["architectures"][0]
-
         try:
             model_class = Model.from_model_architecture(model_architecture)
         except NotImplementedError: