@@ -7828,28 +7828,6 @@ def prepare_tensors(self):
 class BailingMoeV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE_V2
 
-    @staticmethod
-    def permute(
-        weights: Tensor, n_head: int, n_head_kv: int | None, rope_dim: int | None
-    ):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        if rope_dim is None:
-            rope_dim = weights.shape[0] // n_head
-        weights_rope, weights_nope = weights.reshape(
-            n_head, weights.shape[0] // n_head, *weights.shape[1:]
-        ).split([rope_dim, weights.shape[0] // n_head - rope_dim], dim=1)
-        return torch.cat(
-            [
-                weights_rope.reshape(
-                    n_head, 2, rope_dim // 2, *weights_rope.shape[2:]
-                )
-                .swapaxes(1, 2)
-                .reshape(weights_rope.shape),
-                weights_nope,
-            ], dim=1
-        ).reshape(weights.shape)
-
     def set_vocab(self):
         self._set_vocab_gpt2()
 
@@ -7889,7 +7867,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if match and int(match.group(1)) >= block_count:
             return []
 
-        rope_dim = int(self.hparams['partial_rotary_factor'] * self.hparams['head_dim'])
         if name.endswith("query_key_value.weight"):
             n_head = self.hparams["num_attention_heads"]
             n_kv_head = self.hparams.get("num_key_value_heads")
@@ -7899,18 +7876,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
 
             return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeV2Model.permute(q, n_head, n_head, rope_dim)),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeV2Model.permute(k, n_head, n_kv_head, rope_dim)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
             ]
-        elif "attention.key_layernorm" in name or "attention.query_layernorm" in name:
-            mapping = {
-                "attention.key_layernorm": "self_attn.key_layernorm",
-                "attention.query_layernorm": "self_attn.query_layernorm",
-            }
-            for k, v in mapping.items():
-                name = name.replace(k, v)
-            return [(self.map_tensor_name(name), BailingMoeV2Model.permute(data_torch, 1, 1, rope_dim))]
         elif name.find("mlp.experts") != -1:
             n_experts = self.hparams["num_experts"]
             assert bid is not None
@@ -7945,6 +7914,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         pre_tensor_name_mapping = {
             "attention.dense": "self_attn.dense",
             "mlp.gate.expert_bias": "mlp.gate.e_score_correction.bias",
+            "attention.key_layernorm": "self_attn.key_layernorm",
+            "attention.query_layernorm": "self_attn.query_layernorm",
         }
         for k, v in pre_tensor_name_mapping.items():
             name = name.replace(k, v)