@@ -7654,6 +7654,121 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
+class GroveMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GROVEMOE
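+    # GroveMoE augments the routed experts with small shared "adjugate" experts
+    # (stored in the checkpoint as `chunk_experts`), one per group of
+    # `experts_per_group` routed experts; see the modeling_grove_moe.py links below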
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
+        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
+        self.gguf_writer.add_experts_per_group(2)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
+        self.gguf_writer.add_expert_group_scale(0.05)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
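+        # for illustration, a config enabling YaRN would look roughly like the
+        # example in the Qwen3 guide above:
+        # rope_scaling = {"rope_type": "yarn", "factor": 4.0, "original_max_position_embeddings": 32768}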
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
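+    # per-layer staging buffers: expert weights arrive one tensor at a time in
+    # modify_tensors() and are merged once a layer's full set has been collected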
+    _experts: list[dict[str, Tensor]] | None = None
+    _chunk_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".expert_bias"):
+            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
+            return []
+
+        # process the experts separately
+        if name.find("chunk_experts") != -1:
+            n_experts = self.hparams["num_experts"] // 2  # see add_experts_per_group
+            assert bid is not None
+
+            if self._chunk_experts is None:
+                self._chunk_experts = [{} for _ in range(self.block_count)]
+
+            self._chunk_experts[bid][name] = data_torch
+
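+            # each expert contributes three tensors (gate_proj, up_proj, down_proj),
+            # so a complete layer holds n_experts * 3 entries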
+            if len(self._chunk_experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
+                        datas.append(self._chunk_experts[bid][ename])
+                        del self._chunk_experts[bid][ename]
+
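+                    # stacked result has shape (n_experts, rows, cols) of the per-expert weight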
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+        elif name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
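+        # any leftover staged tensors mean the checkpoint contained expert
+        # weights that the merge logic in modify_tensors() never consumed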
+        if self._chunk_experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
+            if len(chunk_experts) > 0:
+                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("ChameleonForConditionalGeneration")
 @ModelBase.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(TextModel):