@@ -7995,6 +7995,121 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
+class GroveMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GROVEMOE
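+    # GroveMoE augments groups of routed experts with shared "adjugate" (chunk)
+    # experts; the group size and scale written below are hardcoded to match the
+    # reference modeling_grove_moe.py (see the FIXME links)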
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
+        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
+        self.gguf_writer.add_experts_per_group(2)
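+        # with groups of 2, each layer has num_experts // 2 chunk experts; the
+        # merge logic in modify_tensors relies on this ratio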
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
+        self.gguf_writer.add_expert_group_scale(0.05)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _chunk_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".expert_bias"):
+            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
+            return []
+
+        # process the experts separately
+        if name.find("chunk_experts") != -1:
+            n_experts = self.hparams["num_experts"] // 2  # see add_experts_per_group
+            assert bid is not None
+
+            if self._chunk_experts is None:
+                self._chunk_experts = [{} for _ in range(self.block_count)]
+
+            self._chunk_experts[bid][name] = data_torch
+
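+            # each expert contributes three 2-D weights (down/gate/up), so the
+            # layer is complete once n_experts * 3 tensors have been collected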
+            if len(self._chunk_experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
+                        datas.append(self._chunk_experts[bid][ename])
+                        del self._chunk_experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
+
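+                    # the merged name carries no expert index, so it maps to the
+                    # stacked 3-D GGUF tensor for this projection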
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+        elif name.find("experts") != -1:
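+            # the routed experts are merged the same way; this branch must come
+            # after chunk_experts above, since that name also contains "experts"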
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
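+        # everything else passes through with standard name mapping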
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._chunk_experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
+            if len(chunk_experts) > 0:
+                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("ChameleonForConditionalGeneration")
 @ModelBase.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(TextModel):