@@ -4422,6 +4422,10 @@ def set_vocab(self):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
+
+        # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
+        self.hparams["num_key_value_heads"] = 1
+
         super().set_gguf_parameters()
         hparams = self.hparams
 
@@ -4430,8 +4434,13 @@ def set_gguf_parameters(self):
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
         self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+
+        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
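
As a side note on the metadata change in this hunk: with MLA the cache holds one shared compressed KV latent (plus the RoPE part of the key) per token instead of full per-head keys and values, which is why the key/value lengths switch to `kv_lora_rank`-based sizes while `num_key_value_heads` becomes 1. Below is a minimal, illustrative sketch of the per-token cache width this implies; the hyperparameter values are assumptions (roughly DeepSeek-V2-like), not taken from this PR or any config.

```python
# Illustrative only: per-token KV-cache width implied by the old MHA-style
# metadata vs. the new MQA/MLA metadata written above.
# Hyperparameter values are assumptions (roughly DeepSeek-V2-like);
# real values come from the model's config.json.
n_head_kv        = 128  # num_key_value_heads before the MQA conversion
kv_lora_rank     = 512
qk_nope_head_dim = 128
qk_rope_head_dim = 64
v_head_dim       = 128

# old metadata: every KV head caches a full (nope + rope) key and a value
mha_k = n_head_kv * (qk_nope_head_dim + qk_rope_head_dim)  # 128 * 192 = 24576
mha_v = n_head_kv * v_head_dim                             # 128 * 128 = 16384

# new metadata: a single KV "head" (MQA) caches the compressed latent plus the rope part
mla_k = kv_lora_rank + qk_rope_head_dim                    # 512 + 64 = 576
mla_v = kv_lora_rank                                       # 512

print(f"per-token cache elements: MHA-style = {mha_k + mha_v}, MLA/MQA = {mla_k + mla_v}")
```
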
@@ -4500,6 +4509,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             else:
                 return []
 
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
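
For clarity, here is a self-contained sketch of the `kv_b_proj` split and transpose performed in `modify_tensors()` above, run on a random tensor of illustrative DeepSeek-V2-like dimensions (the dimension values are assumptions, not read from a real checkpoint). The transpose leaves `k_b` in `(head, kv_lora_rank, qk_nope_head_dim)` layout, which is what allows the key decompression to be applied on the query side ("absorbed") instead of materializing full per-head keys from the cached latent.

```python
import torch

# Standalone sketch of the kv_b_proj split in the hunk above, on a random tensor.
# Dimension values are illustrative (roughly DeepSeek-V2-like) assumptions.
n_head_kv        = 128
qk_nope_head_dim = 128
v_head_dim       = 128
kv_lora_rank     = 512

# kv_b_proj.weight has shape [n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank]
kv_b_proj = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

# group the rows per head, then split each head's block into its K (nope) and V parts
kv_b = kv_b_proj.view(n_head_kv, qk_nope_head_dim + v_head_dim, kv_b_proj.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)

# transpose K so each head's decompression matrix maps the latent dimension directly
k_b = k_b.transpose(1, 2)

assert k_b.shape == (n_head_kv, kv_lora_rank, qk_nope_head_dim)
assert v_b.shape == (n_head_kv, v_head_dim, kv_lora_rank)
```
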