@@ -78,7 +78,7 @@ class ModelBase:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
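For context on the signature change above: the bare * makes every parameter after fname_out keyword-only, so callers must pass the flags by name. A minimal, generic sketch (illustration only, not project code):

    def convert(path: str, *, is_big_endian: bool = False, dry_run: bool = False) -> None:
        print(path, is_big_endian, dry_run)

    convert("model-dir", dry_run=True)   # OK: flags passed by name
    # convert("model-dir", False, True)  # TypeError: too many positional arguments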
@@ -454,13 +454,6 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type


 class TextModel(ModelBase):
-    @classmethod
-    def __init_subclass__(cls):
-        # can't use an abstract property, because overriding it without type errors
-        # would require using decorated functions instead of simply defining the property
-        if "model_arch" not in cls.__dict__:
-            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
-
     def set_vocab(self):
         self._set_vocab_gpt2()

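Note on the removal above: the __init_subclass__ hook enforced, at class-definition time, that every TextModel subclass define model_arch as a class attribute. With this change NomicBertModel (below) picks model_arch per instance inside __init__, which that check cannot accommodate. A generic sketch of the removed behaviour (illustration only, not project code):

    class Base:
        def __init_subclass__(cls):
            # runs when a subclass is defined, before any instance exists
            if "model_arch" not in cls.__dict__:
                raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")

    class Static(Base):
        model_arch = "bert"            # class attribute -> passes the check

    try:
        class Dynamic(Base):           # no class attribute -> TypeError here,
            def __init__(self):        # even though an instance would set it
                self.model_arch = "nomic-bert-moe"
    except TypeError as err:
        print(err)                     # Missing property 'model_arch' for 'Dynamic'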
@@ -3420,32 +3413,58 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

 @ModelBase.register("NomicBertModel")
 class NomicBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model)

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        self.is_moe = bool(hparams.get("moe_every_n_layers"))
+        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)

         # the HF config claims n_ctx=8192, but it uses RoPE scaling
         self.hparams["n_ctx"] = 2048

-        # SwigLU activation
-        assert self.hparams["activation_function"] == "swiglu"
+        assert self.hparams["activation_function"] == ("gelu" if self.is_moe else "swiglu")
+
         # this doesn't do anything in the HF version
         assert self.hparams["causal"] is False
-        # no bias tensors
-        assert self.hparams["qkv_proj_bias"] is False
-        assert self.hparams["mlp_fc1_bias"] is False
-        assert self.hparams["mlp_fc2_bias"] is False
+        # no bias tensors unless MoE
+        assert self.hparams["qkv_proj_bias"] == self.is_moe
+        assert self.hparams["mlp_fc1_bias"] == self.is_moe
+        assert self.hparams["mlp_fc2_bias"] == self.is_moe
+
         # norm at end of layer
         assert self.hparams["prenorm"] is False
         # standard RoPE
         assert self.hparams["rotary_emb_fraction"] == 1.0
         assert self.hparams["rotary_emb_interleaved"] is False
         assert self.hparams["rotary_emb_scale_base"] is None

+    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
+        # If the tensor is an experts bias tensor, skip it by returning an empty list.
+        if "mlp.experts.bias" in name:
+            return []  # Explicitly return an empty list.
+
+        if "mlp.experts.mlp.w1" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            name += ".weight"
+
+        if "mlp.experts.mlp.w2" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            data_torch = data_torch.transpose(1, 2)
+            name += ".weight"
+
+        return [(self.map_tensor_name(name), data_torch)]
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        if self.is_moe:
+            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
+            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])


 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
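A quick sanity check of the expert-weight reshape done in modify_tensors above, with made-up dimensions (the real num_experts, n_inner and n_embd come from the model's config; the flat 2-D storage of w1/w2 is an assumption for illustration only):

    import torch

    num_experts, n_inner, n_embd = 8, 3072, 768

    # fused expert weights, assumed to be stored flat along the expert dimension
    w1 = torch.randn(num_experts * n_inner, n_embd)   # mlp.experts.mlp.w1
    w2 = torch.randn(num_experts * n_inner, n_embd)   # mlp.experts.mlp.w2

    # w1: split the experts out -> (num_experts, n_inner, n_embd)
    w1_out = w1.view(num_experts, n_inner, n_embd)

    # w2: same split, then swap the last two dims -> (num_experts, n_embd, n_inner)
    w2_out = w2.view(num_experts, n_inner, n_embd).transpose(1, 2)

    print(w1_out.shape)   # torch.Size([8, 3072, 768])
    print(w2_out.shape)   # torch.Size([8, 768, 3072])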