@@ -2200,6 +2200,15 @@ class Phi3MiniModel(Model):
22002200 model_arch = gguf .MODEL_ARCH .PHI3
22012201
22022202 def set_vocab (self ):
2203+ # Phi-4 model uses GPT2Tokenizer
2204+ tokenizer_config_file = self .dir_model / 'tokenizer_config.json'
2205+ if tokenizer_config_file .is_file ():
2206+ with open (tokenizer_config_file , "r" , encoding = "utf-8" ) as f :
2207+ tokenizer_config_json = json .load (f )
2208+ tokenizer_class = tokenizer_config_json ['tokenizer_class' ]
2209+ if tokenizer_class == 'GPT2Tokenizer' :
2210+ return self ._set_vocab_gpt2 ()
2211+
22032212 from sentencepiece import SentencePieceProcessor
22042213
22052214 tokenizer_path = self .dir_model / 'tokenizer.model'
@@ -2316,7 +2325,11 @@ def set_gguf_parameters(self):
23162325 self .gguf_writer .add_rope_dimension_count (rope_dims )
23172326 self .gguf_writer .add_rope_freq_base (self .find_hparam (["rope_theta" ]))
23182327 self .gguf_writer .add_file_type (self .ftype )
2319- self .gguf_writer .add_sliding_window (self .find_hparam (["sliding_window" ]))
2328+ sliding_window = self .hparams .get ("sliding_window" )
2329+ # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
2330+ if sliding_window is None :
2331+ sliding_window = 0
2332+ self .gguf_writer .add_sliding_window (sliding_window )
23202333
23212334 def generate_extra_tensors (self ) -> Iterable [tuple [str , Tensor ]]:
23222335 n_embd = self .find_hparam (["hidden_size" , "n_embd" ])
@@ -2615,7 +2628,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26152628 return [(self .map_tensor_name (name ), data_torch )]
26162629
26172630
2618- @Model .register ("BertModel" , "CamembertModel" , "RobertaModel" )
2631+ @Model .register ("BertModel" , "CamembertModel" )
26192632class BertModel (Model ):
26202633 model_arch = gguf .MODEL_ARCH .BERT
26212634
@@ -2688,6 +2701,51 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26882701 return [(self .map_tensor_name (name ), data_torch )]
26892702
26902703
@Model.register("RobertaModel")
class RobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # RoBERTa-style checkpoints reserve position ids up to pad_token_id,
        # so real positions begin at pad_token_id + 1. Remember that offset so
        # we can trim the position-embedding matrix in modify_tensors().
        pad_token_id = self.hparams.get("pad_token_id")
        if pad_token_id is None:
            self._position_offset = None
        else:
            self._position_offset = pad_token_id + 1
            if "max_position_embeddings" in self.hparams:
                self.hparams["max_position_embeddings"] -= self._position_offset

    def set_vocab(self):
        """Support BPE tokenizers for roberta models"""
        bpe_tok_path = self.dir_model / "tokenizer.json"
        if not bpe_tok_path.exists():
            # no BPE tokenizer file — fall back to the BertModel vocab path
            return super().set_vocab()

        self._set_vocab_gpt2()
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
        # "Sequence A" or "Sequence B"
        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # some checkpoints prefix tensor names with "roberta." — strip it
        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
        name = name.removeprefix("roberta.")

        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
        if name == "embeddings.position_embeddings.weight" and self._position_offset is not None:
            data_torch = data_torch[self._position_offset:, :]

        return super().modify_tensors(data_torch, name, bid)
2748+
26912749@Model .register ("NomicBertModel" )
26922750class NomicBertModel (BertModel ):
26932751 model_arch = gguf .MODEL_ARCH .NOMIC_BERT
@@ -3007,6 +3065,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
30073065 if new_name .endswith ("time_mix_w2.weight" ):
30083066 data_torch = data_torch .permute (0 , 2 , 1 )
30093067
3068+ if new_name .endswith ("time_mix_decay.weight" ) or "lerp" in new_name :
3069+ data_torch = data_torch .squeeze ()
3070+
30103071 rescale_every_n_layers = self .hparams ["rescale_every" ]
30113072 if rescale_every_n_layers > 0 :
30123073 if new_name .endswith ("time_mix_output.weight" ) or new_name .endswith ("channel_mix_value.weight" ):
0 commit comments