@@ -227,15 +227,14 @@ def _get_part_names(self):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +254,15 @@ def _set_vocab_gpt2(self):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
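A rough standalone sketch of the idea behind the new shared helper (illustrative only: the function name basic_vocab_sketch is made up here, and the real helper also classifies added/special tokens): read a Hugging Face tokenizer's vocab and build parallel token/type lists padded out to the full vocab size.

from __future__ import annotations
from transformers import AutoTokenizer
import gguf

def basic_vocab_sketch(model_dir: str) -> tuple[list[str], list[int]]:
    # load the tokenizer and invert its vocab (id -> token string)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    vocab_size = len(tokenizer.vocab)
    reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}

    tokens: list[str] = []
    toktypes: list[int] = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # fill holes in the id space so len(tokens) == vocab_size
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)
    return tokens, toktypes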
@@ -2043,34 +2046,25 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
 
         # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                 return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
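For clarity, the phantom-space mapping above can be exercised on its own. A minimal sketch (the token strings are hypothetical): WordPiece marks word-internal pieces with "##", and the conversion instead prefixes word-initial pieces with U+2581, the SentencePiece whitespace marker, so the converted vocab looks SentencePiece-like.

def phantom(tok: str) -> str:
    if tok.startswith("[") and tok.endswith("]"):
        return tok             # special tokens such as [CLS], [SEP], [PAD] pass through
    if tok.startswith("##"):
        return tok[2:]         # word-internal piece: drop the "##" marker
    return "\u2581" + tok      # word-initial piece: add the phantom space

print(phantom("[CLS]"))   # [CLS]
print(phantom("##ing"))   # ing
print(phantom("hello"))   # ▁hello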
@@ -2142,16 +2136,6 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-
 
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2327,7 +2311,8 @@ def write_tensors(self):
                 data = data.astype(np.float32)
 
             # if f16 desired, convert big float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                 data = data.astype(np.float16)
 
             print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
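One likely reason for this last hunk (an assumption, not stated in the diff): str.removesuffix() exists only on Python 3.9+, while the rewritten check also runs on older interpreters. A small comparison with a hypothetical tensor name:

name = "blk.0.ssm_in.weight"

old = name.removesuffix(".weight")                                # Python >= 3.9 only
new = name[:-len(".weight")] if name.endswith(".weight") else ""  # any Python 3

print(old.endswith((".ssm_in", ".ssm_out", "token_embd", "output")))  # True
print(new.endswith((".ssm_in", ".ssm_out", "token_embd", "output")))  # True

# The two differ only when the name does not end in ".weight": removesuffix()
# returns the name unchanged, while the new expression yields "" and the
# endswith() check then fails.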