1 file changed, 8 additions, 4 deletions

@@ -331,7 +331,7 @@ def _set_vocab_sentencepiece(self):
         tokenizer = SentencePieceProcessor(str(tokenizer_path))
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
-        for token_id in range(vocab_size):
+        for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.id_to_piece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.get_score(token_id)
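
The first hunk above loops over `tokenizer.vocab_size()` rather than the `vocab_size` taken from `hparams`, since the latter can exceed the number of pieces actually stored in the SentencePiece model (the extra entries typically come from `added_tokens.json`), so querying those ids through the tokenizer is not valid. A minimal standalone sketch of that guarded loop, assuming a loaded `SentencePieceProcessor`; the helper name `collect_base_vocab` is hypothetical:

```python
from sentencepiece import SentencePieceProcessor


def collect_base_vocab(tokenizer: SentencePieceProcessor) -> tuple[list[bytes], list[float]]:
    """Hypothetical helper mirroring the changed loop: only ids the model
    actually contains are queried, even if hparams['vocab_size'] is larger."""
    tokens: list[bytes] = []
    scores: list[float] = []
    for token_id in range(tokenizer.vocab_size()):
        piece = tokenizer.id_to_piece(token_id)
        tokens.append(piece.encode("utf-8"))
        scores.append(tokenizer.get_score(token_id))
    return tokens, scores
```

The second hunk below then fills the remaining slots from `added_tokens.json`.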
@@ -356,9 +356,13 @@ def _set_vocab_sentencepiece(self):
                 added_tokens_json = json.load(f)
 
             for key in added_tokens_json:
-                tokens.append(key.encode("utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                key = key.encode("utf-8")
+                if key not in tokens:
+                    tokens.append(key)
+                    scores.append(-1000.0)
+                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
+        assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
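
The second hunk skips entries from `added_tokens.json` that already exist in the base vocabulary (some models repeat base pieces there), and the new `assert` checks that the merged list ends up exactly `vocab_size` entries long. A minimal sketch of that dedup-and-check step, assuming the `tokens`/`scores`/`toktypes` lists built by the loop above; `merge_added_tokens` and the `user_defined_type` parameter are hypothetical names, the latter standing in for `SentencePieceTokenTypes.USER_DEFINED`:

```python
import json
from pathlib import Path


def merge_added_tokens(tokens: list[bytes], scores: list[float], toktypes: list[int],
                       added_tokens_file: Path, vocab_size: int,
                       user_defined_type: int) -> None:
    """Hypothetical helper mirroring the second hunk: skip added tokens that
    duplicate base-vocab pieces, then verify the final count."""
    if added_tokens_file.is_file():
        with open(added_tokens_file, "r", encoding="utf-8") as f:
            added_tokens_json = json.load(f)
        for key in added_tokens_json:
            key = key.encode("utf-8")
            if key not in tokens:
                tokens.append(key)
                scores.append(-1000.0)
                toktypes.append(user_defined_type)
    # With duplicates skipped, the merged vocabulary must match the size
    # declared in the model's hparams.
    assert len(tokens) == vocab_size
```

The linear `key not in tokens` check mirrors the diff; for very large added-token lists, keeping a set of seen tokens alongside the list would make the lookup cheaper.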