@@ -888,6 +888,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
888888 if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756" :
889889 # ref: https://huggingface.co/JetBrains/Mellum-4b-base
890890 res = "mellum"
891+ if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206" :
892+ # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
893+ res = "llada-moe"
891894
892895 if res is None :
893896 logger .warning ("\n " )
@@ -8239,6 +8242,76 @@ def prepare_tensors(self):
82398242 raise ValueError (f"Unprocessed experts: { experts } " )
82408243
82418244
@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
class LLaDAMoEModel(TextModel):
    """Converter for LLaDA-MoE diffusion mixture-of-experts checkpoints.

    Writes MoE metadata (expert count, expert FFN width, top-k), marks the
    model as non-causal with unshifted logits (diffusion objective), and
    merges the per-expert FFN weights of each layer into single stacked 3D
    tensors using the same staging scheme as Qwen2MoeModel.
    """
    model_arch = gguf.MODEL_ARCH.LLADA_MOE

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        if (n_experts := self.hparams.get("num_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)

        if (expert_ff_len := self.hparams.get("expert_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(expert_ff_len)

        # top-k: how many experts are routed per token
        if (top_k := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(top_k)

        # mask token driving the diffusion denoising process (fixed id for
        # this tokenizer); diffusion models attend bidirectionally and do
        # not shift logits like autoregressive LMs
        self.gguf_writer.add_mask_token_id(156895)
        self.gguf_writer.add_causal_attention(False)
        self.gguf_writer.add_diffusion_shift_logits(False)

    # per-layer staging area: holds expert tensors until every expert of a
    # layer has arrived and they can be stacked into one tensor
    _experts: list[dict[str, Tensor]] | None = None

    # Copied from: Qwen2MoeModel
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # expert tensors are buffered per layer and emitted only once the
        # whole layer is complete; everything else maps straight through
        if "experts" in name:
            n_experts = self.hparams["num_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # each expert contributes 3 weights (down/gate/up), so the layer
            # is complete once n_experts * 3 tensors have been collected
            if len(self._experts[bid]) < n_experts * 3:
                return []

            merged: list[tuple[str, Tensor]] = []

            # stack every expert's copy of each projection into one 3D tensor
            for proj in ("down_proj", "gate_proj", "up_proj"):
                parts: list[Tensor] = []
                for expert_idx in range(n_experts):
                    key = f"model.layers.{bid}.mlp.experts.{expert_idx}.{proj}.weight"
                    parts.append(self._experts[bid][key])
                    del self._experts[bid][key]

                stacked = torch.stack(parts, dim=0)
                merged_name = f"model.layers.{bid}.mlp.experts.{proj}.weight"
                merged.append((self.map_tensor_name(merged_name), stacked))

            return merged

        return [(self.map_tensor_name(name), data_torch)]

    # Copied from: Qwen2MoeModel
    def prepare_tensors(self):
        super().prepare_tensors()

        # any tensor left in the staging area means a layer never completed —
        # fail loudly rather than silently writing a broken model
        if self._experts is not None:
            leftovers = [k for layer in self._experts for k in layer]
            if leftovers:
                raise ValueError(f"Unprocessed experts: {leftovers}")
8314+
82428315@ModelBase .register ("HunYuanDenseV1ForCausalLM" )
82438316class HunYuanModel (TextModel ):
82448317 model_arch = gguf .MODEL_ARCH .HUNYUAN_DENSE
0 commit comments