@@ -2861,7 +2861,8 @@ def set_gguf_parameters(self):
28612861 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
28622862 num_heads = self .hparams ["num_attention_heads" ]
28632863 num_kv_heads = self .hparams ["num_key_value_heads" ]
2864- head_dim = self .hparams ["head_dim" ]
2864+ if (head_dim := self .hparams .get ("head_dim" )) is None :
2865+ head_dim = self .hparams ["hidden_size" ] // num_heads
28652866
28662867 if "ernie." in name :
28672868 name = name .replace ("ernie." , "model." )
@@ -2894,6 +2895,93 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
28942895 return [(self .map_tensor_name (name ), data_torch )]
28952896
28962897
@ModelBase.register("Ernie4_5_MoeForCausalLM")
class Ernie4_5MoeModel(Ernie4_5Model):
    """Ernie 4.5 MoE converter.

    Extends Ernie4_5Model with MoE hyperparameter export and merging of the
    per-expert gate/up/down projection weights into single stacked 3D tensors,
    mirroring the approach used for DeepseekV2.
    """
    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
    # Per-layer buckets that accumulate expert tensors until a layer is complete.
    _experts: list[dict[str, Tensor]] | None = None

    # Multi-Token Prediction (MTP) tensors are skipped for now (same as
    # DeepseekV2). Dots are escaped so "." matches a literal dot only.
    _mtp_name_re = re.compile(r"model\.mtp_(?:block|emb_norm|hidden_norm|linear_proj)\.\d+")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._experts = [{} for _ in range(self.block_count)]

    def set_gguf_parameters(self):
        """Write MoE-specific hyperparameters on top of the base parameters."""
        super().set_gguf_parameters()
        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
            self.gguf_writer.add_expert_shared_count(shared_expert_count)
            # NOTE(review): shared-expert FFN length is derived as
            # intermediate_size // num_key_value_heads — confirm against the
            # upstream model config semantics.
            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename/merge a single checkpoint tensor.

        Expert tensors are buffered per layer; once all ``moe_num_experts * 3``
        projections of a layer have been seen, they are stacked into three
        3D tensors (gate/up/down) and emitted together.
        """
        # Modify correction bias name as in DeepseekV2.
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # Skip all Multi-Token Prediction (MTP) tensors for now (as DeepseekV2 does).
        if self._mtp_name_re.match(name):
            return []

        # Process the experts separately.
        if "mlp.experts" in name:
            n_experts = self.hparams["moe_num_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # Each expert contributes gate_proj, up_proj and down_proj;
            # wait until the layer's full set has been collected.
            if len(self._experts[bid]) < n_experts * 3:
                return []

            tensors: list[tuple[str, Tensor]] = []

            # Merge the experts into a single 3D tensor per projection.
            for w_name in ("gate_proj", "up_proj", "down_proj"):
                datas: list[Tensor] = []

                for xid in range(n_experts):
                    ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                    datas.append(self._experts[bid][ename])
                    del self._experts[bid][ename]

                merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                tensors.append((self.map_tensor_name(merged_name), torch.stack(datas, dim=0)))

            return tensors

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Finalize conversion; fail loudly if any expert tensor was left unmerged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
2983+
2984+
28972985@ModelBase .register (
28982986 "Qwen2VLModel" ,
28992987 "Qwen2VLForConditionalGeneration" ,
0 commit comments