@@ -7828,6 +7828,28 @@ def prepare_tensors(self):
78287828class BailingMoeV2Model (TextModel ):
78297829 model_arch = gguf .MODEL_ARCH .BAILINGMOE_V2
78307830
7831+ @staticmethod
7832+ def permute (
7833+ weights : Tensor , n_head : int , n_head_kv : int | None , rope_dim : int | None
7834+ ):
7835+ if n_head_kv is not None and n_head != n_head_kv :
7836+ n_head = n_head_kv
7837+ if rope_dim is None :
7838+ rope_dim = weights .shape [0 ] // n_head
7839+ weights_rope , weights_nope = weights .reshape (
7840+ n_head , weights .shape [0 ] // n_head , * weights .shape [1 :]
7841+ ).split ([rope_dim , weights .shape [0 ] // n_head - rope_dim ], dim = 1 )
7842+ return torch .cat (
7843+ [
7844+ weights_rope .reshape (
7845+ n_head , 2 , rope_dim // 2 , * weights_rope .shape [2 :]
7846+ )
7847+ .swapaxes (1 , 2 )
7848+ .reshape (weights_rope .shape ),
7849+ weights_nope ,
7850+ ], dim = 1
7851+ ).reshape (weights .shape )
7852+
def set_vocab(self):
    """Populate the GGUF vocabulary by delegating to the shared GPT-2 BPE loader."""
    self._set_vocab_gpt2()
@@ -7867,6 +7889,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78677889 if match and int (match .group (1 )) >= block_count :
78687890 return []
78697891
7892+ rope_dim = int (self .hparams ['partial_rotary_factor' ] * self .hparams ['head_dim' ])
78707893 if name .endswith ("query_key_value.weight" ):
78717894 n_head = self .hparams ["num_attention_heads" ]
78727895 n_kv_head = self .hparams .get ("num_key_value_heads" )
@@ -7876,10 +7899,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78767899 q , k , v = data_torch .split ([n_head * head_dim , n_kv_head * head_dim , n_kv_head * head_dim ], dim = - 2 )
78777900
78787901 return [
7879- (self .format_tensor_name (gguf .MODEL_TENSOR .ATTN_Q , bid ), q ),
7880- (self .format_tensor_name (gguf .MODEL_TENSOR .ATTN_K , bid ), k ),
7902+ (self .format_tensor_name (gguf .MODEL_TENSOR .ATTN_Q , bid ), BailingMoeV2Model . permute ( q , n_head , n_head , rope_dim ) ),
7903+ (self .format_tensor_name (gguf .MODEL_TENSOR .ATTN_K , bid ), BailingMoeV2Model . permute ( k , n_head , n_kv_head , rope_dim ) ),
78817904 (self .format_tensor_name (gguf .MODEL_TENSOR .ATTN_V , bid ), v )
78827905 ]
7906+ elif "attention.key_layernorm" in name or "attention.query_layernorm" in name :
7907+ mapping = {
7908+ "attention.key_layernorm" : "self_attn.key_layernorm" ,
7909+ "attention.query_layernorm" : "self_attn.query_layernorm" ,
7910+ }
7911+ for k , v in mapping .items ():
7912+ name = name .replace (k , v )
7913+ return [(self .map_tensor_name (name ), BailingMoeV2Model .permute (data_torch , 1 , 1 , rope_dim ))]
78837914 elif name .find ("mlp.experts" ) != - 1 :
78847915 n_experts = self .hparams ["num_experts" ]
78857916 assert bid is not None
@@ -7912,10 +7943,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
79127943 return tensors
79137944
79147945 pre_tensor_name_mapping = {
7915- 'attention.dense' : 'self_attn.dense' ,
7916- 'attention.key_layernorm' : 'self_attn.key_layernorm' ,
7917- 'attention.query_layernorm' : 'self_attn.query_layernorm' ,
7918- 'mlp.gate.expert_bias' : 'mlp.gate.e_score_correction.bias' ,
7946+ "attention.dense" : "self_attn.dense" ,
7947+ "mlp.gate.expert_bias" : "mlp.gate.e_score_correction.bias" ,
79197948 }
79207949 for k , v in pre_tensor_name_mapping .items ():
79217950 name = name .replace (k , v )
0 commit comments