@@ -9015,6 +9015,75 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
90159015 return [(self .map_tensor_name (name ), data_torch )]
90169016
90179017
@ModelBase.register("Lfm2MoeForCausalLM")
class LFM2MoeModel(TextModel):
    """Convert LFM2-MoE checkpoints ("Lfm2MoeForCausalLM") to GGUF.

    Hybrid architecture: only the layers marked "full_attention" in
    hparams["layer_types"] carry KV heads; the remaining layers use a
    short convolution (its cache length comes from "conv_L_cache").
    Per-expert FFN weights are merged into stacked 3D tensors on the fly.
    """

    model_arch = gguf.MODEL_ARCH.LFM2MOE

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Staging area for per-expert weights until a layer's full set has
        # been seen and can be stacked: {bid: {tensor_name: tensor}}.
        # NOTE: kept as an *instance* attribute — the original declared it
        # as a class-level dict, which would be shared across converter
        # instances in the same process and leak partially merged state.
        self._experts_cache: dict[int, dict[str, Tensor]] = {}

    def set_gguf_parameters(self):
        # Report KV heads only for attention layers: non-attention layers
        # get 0 so the per-layer list lines up with hparams["layer_types"].
        n_kv_heads = self.hparams["num_key_value_heads"]
        self.hparams["num_key_value_heads"] = [
            n_kv_heads if layer_type == "full_attention" else 0
            for layer_type in self.hparams["layer_types"]
        ]

        super().set_gguf_parameters()

        # MoE metadata: expert count/width, number of leading dense (non-MoE)
        # blocks, and the sigmoid gating used by LFM2-MoE routing.
        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename/reshape one checkpoint tensor; merge per-expert weights.

        Expert tensors are buffered per layer and emitted as one stacked
        tensor per projection (w1/w2/w3) once all num_experts x 3 weights
        for that layer have arrived; until then an empty list is returned.
        """
        # the conv op requires a 2d tensor, so drop the singleton dim
        if "conv.conv" in name:
            data_torch = data_torch.squeeze(1)

        if name.endswith(".expert_bias"):
            name = name.replace(".expert_bias", ".expert_bias.bias")

        # merge expert weights
        if "experts" in name:
            n_experts = self.hparams["num_experts"]
            assert bid is not None

            expert_cache = self._experts_cache.setdefault(bid, {})
            expert_cache[name] = data_torch
            expert_weights = ["w1", "w2", "w3"]

            # not enough expert weights to merge yet
            if len(expert_cache) < n_experts * len(expert_weights):
                return []

            tensors: list[tuple[str, Tensor]] = []
            for w_name in expert_weights:
                datas: list[Tensor] = []

                for xid in range(n_experts):
                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
                    datas.append(expert_cache[ename])
                    del expert_cache[ename]

                # stack E per-expert 2D weights into one (E, out, in) tensor
                data_torch = torch.stack(datas, dim=0)
                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
                new_name = self.map_tensor_name(merged_name)
                tensors.append((new_name, data_torch))

            del self._experts_cache[bid]
            return tensors

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Finalize conversion; fail loudly if any expert set was left unmerged."""
        super().prepare_tensors()
        assert not self._experts_cache
9085+
9086+
90189087@ModelBase .register ("Lfm2VlForConditionalGeneration" )
90199088class LFM2VLModel (MmprojModel ):
90209089 def __init__ (self , * args , ** kwargs ):
0 commit comments