@@ -573,6 +573,10 @@ def _set_vocab_sentencepiece(self):

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
@@ -588,21 +592,23 @@ def _set_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE

-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype

         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
-
                 for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -612,8 +618,6 @@ def _set_vocab_sentencepiece(self):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

-        assert len(tokens) == vocab_size
-
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
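
Below is a minimal, self-contained sketch of the idea behind the change, not the converter itself: vocab_size, the base-vocab loop, and the added_tokens_json dict are made-up values, and plain strings stand in for SentencePieceTokenTypes. Pre-allocating the three lists and writing entries by token id keeps each token in its proper slot even when added_tokens.json assigns ids out of order, while out-of-range ids are skipped instead of growing the lists past vocab_size.

# Sketch only: placeholder values, string token types instead of SentencePieceTokenTypes.
vocab_size = 8
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[str] = ["UNKNOWN"] * vocab_size

# Pretend the base SentencePiece model only defines ids 0..5.
for token_id in range(6):
    tokens[token_id] = f"piece{token_id}".encode("utf-8")
    scores[token_id] = 0.0
    toktypes[token_id] = "NORMAL"

# added_tokens.json may map names to arbitrary ids, including ids past vocab_size.
added_tokens_json = {"<extra>": 6, "<oops>": 42}
for key in added_tokens_json:
    token_id = added_tokens_json[key]
    if token_id >= vocab_size:
        print(f"ignore token {token_id}: id is out of range, max={vocab_size - 1}")
        continue

    tokens[token_id] = key.encode("utf-8")
    scores[token_id] = -1000.0
    toktypes[token_id] = "USER_DEFINED"

# The length is fixed up front, which is why the old trailing assert is redundant.
assert len(tokens) == vocab_size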