@@ -2861,7 +2861,8 @@ def set_gguf_parameters(self):
28612861 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
28622862 num_heads = self .hparams ["num_attention_heads" ]
28632863 num_kv_heads = self .hparams ["num_key_value_heads" ]
2864- head_dim = self .hparams ["head_dim" ]
2864+ if (head_dim := self .hparams .get ("head_dim" )) is None :
2865+ head_dim = self .hparams ["hidden_size" ] // num_heads
28652866
28662867 if "ernie." in name :
28672868 name = name .replace ("ernie." , "model." )
@@ -2894,6 +2895,92 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
28942895 return [(self .map_tensor_name (name ), data_torch )]
28952896
28962897
@ModelBase.register("Ernie4_5_MoeForCausalLM")
class Ernie4_5MoeModel(Ernie4_5Model):
    """Converter for ERNIE 4.5 Mixture-of-Experts checkpoints.

    Extends the dense ``Ernie4_5Model`` with MoE-specific GGUF metadata and
    merges the per-expert weight tensors of each layer into single stacked
    3D tensors, mirroring the handling in the DeepseekV2 converter.
    """

    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE

    # Per-layer staging area: expert tensors are buffered here until all of a
    # layer's experts have been seen and can be stacked into one tensor.
    _experts: list[dict[str, Tensor]] | None = None

    # Multi-Token Prediction (MTP) tensors are skipped for now (same policy as
    # DeepseekV2). Dots are escaped so that e.g. "modelXmtp_blockY0" cannot
    # accidentally match; compiled once instead of per-tensor.
    _mtp_tensor_re = re.compile(r"model\.mtp_(?:block|emb_norm|hidden_norm|linear_proj)\.(\d+)")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._experts = [{} for _ in range(self.block_count)]

    def set_gguf_parameters(self):
        """Emit MoE hyperparameters on top of the dense model's metadata."""
        super().set_gguf_parameters()
        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
        # NOTE(review): dividing intermediate_size by num_key_value_heads to get
        # the shared-expert FFN length is unusual — assumed intentional, verify
        # against the upstream model config.
        if (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero or more GGUF tensors.

        Renames the router correction bias, drops MTP tensors, and buffers
        per-expert weights until a layer is complete, then emits the three
        stacked expert tensors (gate/up/down) for that layer.
        """
        # Rename the correction bias as in DeepseekV2 so tensor mapping works.
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # Skip all Multi-Token Prediction tensors (block, emb_norm,
        # hidden_norm, linear_proj) — not converted for now.
        if self._mtp_tensor_re.match(name):
            return []

        # Process the experts separately: buffer until the layer is complete.
        if "mlp.experts" in name:
            n_experts = self.hparams["moe_num_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # Hold back until every expert's gate/up/down weight has arrived
            # (3 tensors per expert).
            if len(self._experts[bid]) < n_experts * 3:
                return []

            tensors: list[tuple[str, Tensor]] = []

            # Merge each projection's experts into a single 3D tensor.
            for w_name in ("gate_proj", "up_proj", "down_proj"):
                datas: list[Tensor] = []

                for xid in range(n_experts):
                    ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                    datas.append(self._experts[bid][ename])
                    del self._experts[bid][ename]

                data_torch = torch.stack(datas, dim=0)
                merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                tensors.append((self.map_tensor_name(merged_name), data_torch))

            return tensors

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Fail loudly if any buffered expert tensors were never merged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
2983+
28972984@ModelBase .register (
28982985 "Qwen2VLModel" ,
28992986 "Qwen2VLForConditionalGeneration" ,
0 commit comments