@@ -888,6 +888,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
888888 if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756" :
889889 # ref: https://huggingface.co/JetBrains/Mellum-4b-base
890890 res = "mellum"
891+ if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206" :
892+ # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Instruct
893+ res = "llada-moe"
891894
892895 if res is None :
893896 logger .warning ("\n " )
@@ -8239,6 +8242,81 @@ def prepare_tensors(self):
82398242 raise ValueError (f"Unprocessed experts: { experts } " )
82408243
82418244
@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
class LLaDAMoEModel(TextModel):
    """Converter for LLaDA-MoE diffusion language models.

    Emits MoE metadata (expert counts, expert FFN width) plus the
    diffusion-specific flags (non-causal attention, no logit shift), and
    merges each layer's per-expert FFN weights into single stacked 3D
    tensors, mirroring the handling in Qwen2MoeModel.
    """

    model_arch = gguf.MODEL_ARCH.LLADA_MOE

    # Per-layer staging area: expert tensors accumulate here until a layer
    # has a full gate/up/down set for every expert and can be merged.
    _experts: list[dict[str, Tensor]] | None = None

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Optional MoE hyperparameters — written only when present.
        optional_writes = (
            ("num_experts", self.gguf_writer.add_expert_count),
            ("expert_intermediate_size", self.gguf_writer.add_expert_feed_forward_length),
            # number of experts used per token (top-k)
            ("num_experts_per_tok", self.gguf_writer.add_expert_used_count),
        )
        for key, write in optional_writes:
            value = self.hparams.get(key)
            if value is not None:
                write(value)

        # Fall back to the known mask token id when the config omits it.
        mask_token = self.hparams.get("mask_token_id")
        self.gguf_writer.add_mask_token_id(156895 if mask_token is None else mask_token)

        # Diffusion models attend bidirectionally and do not shift logits.
        self.gguf_writer.add_causal_attention(False)
        self.gguf_writer.add_diffusion_shift_logits(False)

    # Copied from: Qwen2MoeModel
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Pass non-expert tensors through; stage expert weights and emit
        merged 3D tensors once a layer's full expert set has arrived."""
        if "experts" not in name:
            return [(self.map_tensor_name(name), data_torch)]

        assert bid is not None
        n_experts = self.hparams["num_experts"]

        if self._experts is None:
            self._experts = [{} for _ in range(self.block_count)]

        layer_bucket = self._experts[bid]
        layer_bucket[name] = data_torch

        # Wait until every expert has contributed gate, up, and down weights.
        if len(layer_bucket) < n_experts * 3:
            return []

        merged: list[tuple[str, Tensor]] = []
        for proj in ("down_proj", "gate_proj", "up_proj"):
            # Stack the per-expert matrices into one (n_experts, out, in) tensor,
            # draining the staging dict as we go.
            stacked = torch.stack(
                [
                    layer_bucket.pop(f"model.layers.{bid}.mlp.experts.{eid}.{proj}.weight")
                    for eid in range(n_experts)
                ],
                dim=0,
            )
            merged_name = f"model.layers.{bid}.mlp.experts.{proj}.weight"
            merged.append((self.map_tensor_name(merged_name), stacked))
        return merged

    # Copied from: Qwen2MoeModel
    def prepare_tensors(self):
        """Finalize conversion, failing loudly if any expert tensor was never merged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            leftovers = [key for layer in self._experts for key in layer]
            if leftovers:
                raise ValueError(f"Unprocessed experts: {leftovers}")
82428320@ModelBase .register ("HunYuanDenseV1ForCausalLM" )
82438321class HunYuanModel (TextModel ):
82448322 model_arch = gguf .MODEL_ARCH .HUNYUAN_DENSE
0 commit comments