@@ -2378,6 +2378,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
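Note on the hunk above: the new branch keys off the `tokenizer_class` field of `tokenizer_config.json`. Phi-4 checkpoints declare `GPT2Tokenizer` (a BPE vocab), while earlier Phi-3 checkpoints ship a SentencePiece `tokenizer.model` and fall through to the existing path. A minimal standalone sketch of that detection, assuming only the directory layout shown in the hunk (the `model_dir` argument is a placeholder):

```python
import json
from pathlib import Path

def uses_bpe_tokenizer(model_dir: Path) -> bool:
    """Mirror the check added above: True if tokenizer_config.json declares
    GPT2Tokenizer (BPE), otherwise fall back to SentencePiece handling."""
    cfg = model_dir / "tokenizer_config.json"
    if not cfg.is_file():
        return False
    with open(cfg, "r", encoding="utf-8") as f:
        return json.load(f).get("tokenizer_class") == "GPT2Tokenizer"

# Example (hypothetical paths): uses_bpe_tokenizer(Path("phi-4")) -> True,
# uses_bpe_tokenizer(Path("phi-3-mini")) -> False
```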
@@ -2494,7 +2503,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
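Why the change above: `find_hparam` raises when the key is absent, and Phi-4's `config.json` carries no `sliding_window` entry, so the converter now reads it with `.get()` and writes `0` as a sentinel. A small sketch of the fallback, with made-up config dicts standing in for `self.hparams`:

```python
# Sketch of the fallback behaviour; the config values below are illustrative only.
hparams_phi3 = {"sliding_window": 262144}   # hypothetical Phi-3-style config
hparams_phi4 = {}                           # Phi-4-style config without the key

def sliding_window_or_zero(hparams: dict) -> int:
    # 0 is the sentinel that lets loaders tell Phi-4 apart from other PHI3-arch models
    value = hparams.get("sliding_window")
    return 0 if value is None else value

assert sliding_window_or_zero(hparams_phi3) == 262144
assert sliding_window_or_zero(hparams_phi4) == 0
```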
@@ -2793,7 +2806,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2859,13 +2872,73 @@ def phantom(tok):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these
 
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:, :]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
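Background for the `RobertaModel` class added above: Hugging Face RoBERTa reserves position ids `0..pad_token_id` for padding, so real positions start at `pad_token_id + 1`; the converter therefore subtracts that offset from `max_position_embeddings` and slices the same number of rows off the position-embedding weight. A sketch with illustrative numbers (a typical RoBERTa config has `pad_token_id = 1` and `max_position_embeddings = 514`, but these values are not taken from the diff):

```python
import torch

# Illustrative shapes only.
pad_token_id = 1
position_offset = 1 + pad_token_id          # matches RobertaModel.__init__ above
position_embd = torch.randn(514, 768)       # (max_position_embeddings, hidden_size)

# Drop the reserved rows, as modify_tensors does for embeddings.position_embeddings.weight.
trimmed = position_embd[position_offset:, :]
assert trimmed.shape[0] == 514 - position_offset  # 512 usable positions remain
```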
@@ -3185,6 +3258,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if new_name.endswith("time_mix_w2.weight"):
             data_torch = data_torch.permute(0, 2, 1)
 
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
         rescale_every_n_layers = self.hparams["rescale_every"]
         if rescale_every_n_layers > 0:
             if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
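On the `squeeze()` added above: the checkpoint stores `time_mix_decay` and the `lerp` mixing parameters with leading singleton dimensions, and the GGUF tensor is expected as a flat vector. A sketch of the effect, assuming a `(1, 1, n_embd)` layout; the exact source shape is an assumption here, the diff only shows that singleton dims are dropped:

```python
import torch

# Hypothetical checkpoint layout for illustration.
time_mix_decay = torch.randn(1, 1, 2048)    # (1, 1, n_embd)

squeezed = time_mix_decay.squeeze()
assert squeezed.shape == (2048,)            # singleton dims removed, data unchanged
```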