@@ -3665,7 +3665,7 @@ def _xlmroberta_set_vocab(self) -> None:
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = max(self.hparams.get('vocab_size', 0), tokenizer.vocab_size())

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
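The `max()` sizes the token table to whichever vocabulary is larger: `config.json` may report more slots than `sentencepiece.bpe.model` defines (XLM-RoBERTa-style checkpoints place `<mask>` on top of the sentencepiece pieces), and a missing or under-reported `vocab_size` in hparams should not truncate the table either. A minimal sketch of the sizing logic, using assumed XLM-RoBERTa-like figures rather than values taken from this diff:

    # Hypothetical sketch, not the converter itself; the counts are assumed
    # XLM-RoBERTa-style figures (config reports more ids than the
    # sentencepiece model defines).
    hparams = {"vocab_size": 250002}   # from config.json
    sp_vocab_size = 250000             # e.g. tokenizer.vocab_size()

    old_size = hparams.get("vocab_size", sp_vocab_size)           # trusts hparams outright
    new_size = max(hparams.get("vocab_size", 0), sp_vocab_size)   # takes the larger of the two

    # every id gets a [PADi] placeholder up front; real pieces overwrite them later
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(new_size)]
    assert len(tokens) == 250002

Because the table is pre-allocated to `vocab_size`, the `if vocab_size > len(tokens):` padding loop removed in the next hunk appears to be unreachable anyway; the placeholders it would have appended already exist.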
@@ -3690,14 +3690,6 @@ def _xlmroberta_set_vocab(self) -> None:
             scores[token_id] = score
             toktypes[token_id] = toktype

-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
         # realign tokens (see HF tokenizer code)
         tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
         scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
@@ -3708,6 +3700,12 @@ def _xlmroberta_set_vocab(self) -> None:
             SentencePieceTokenTypes.UNKNOWN,
         ] + toktypes[3:-1]

+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
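The realignment mirrors the HF fairseq mapping: the four specials take ids 0-3, sentencepiece pieces from id 3 are shifted up by one, and the final slot is dropped to keep the length at `vocab_size`. That leaves the last id holding a `[PAD…]` placeholder, which the new NOMIC_BERT_MOE branch claims for `<mask>`. A toy-scale sketch of the index arithmetic (the real converter works with a 250002-entry table and patches id 250001):

    # Toy vocabulary to illustrate the realignment plus mask patch; the
    # sizes are scaled down and the piece names are made up.
    vocab_size = 8  # pretend config vocab; the sp model defines only ids 0..5
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    for i, piece in enumerate([b"<unk>", b"<s>", b"</s>", b"_a", b"_b", b"_c"]):
        tokens[i] = piece  # sentencepiece pieces fill the low ids

    # realign: 4 specials, then sp pieces from id 3, dropping the last slot
    tokens = [b"<s>", b"<pad>", b"</s>", b"<unk>"] + tokens[3:-1]
    assert len(tokens) == vocab_size
    assert tokens[-1] == b"[PAD6]"    # trailing slot is still a placeholder

    # the NOMIC_BERT_MOE branch overwrites that trailing slot with <mask>
    tokens[vocab_size - 1] = b"<mask>"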