@@ -3814,7 +3814,7 @@ def _xlmroberta_set_vocab(self) -> None:
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ def _xlmroberta_set_vocab(self) -> None:
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
 
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ def _xlmroberta_set_vocab(self) -> None:
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
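
Note on the change: the new loop can drop the old trailing `if vocab_size > len(tokens)` padding block because the token lists are now pre-filled at full `vocab_size` with `[PAD{i}]` placeholders (see the context lines above), so any ids the tokenizer does not define simply keep their placeholder. A minimal, self-contained sketch of that strategy; the names `hparams_vocab_size` and `tokenizer_pieces` are illustrative stand-ins, not identifiers from convert_hf_to_gguf.py:

```python
# Illustrative sketch only; hparams_vocab_size and tokenizer_pieces are assumed
# stand-ins for the model config's vocab_size and the HF tokenizer's vocabulary.
hparams_vocab_size = 250037                      # assumed value from config.json
tokenizer_pieces = ["<s>", "<pad>", "</s>", "<unk>", "▁hello"]  # toy vocab

# Take the larger of the two sizes, as the diff above does with max(...).
vocab_size = max(hparams_vocab_size, len(tokenizer_pieces))

# Pre-fill every slot with a [PAD{i}] placeholder, then overwrite only the ids
# the tokenizer actually defines; trailing ids keep their placeholder.
tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores = [-10000.0] * vocab_size
for token_id, piece in enumerate(tokenizer_pieces):
    tokens[token_id] = piece.encode("utf-8")
    scores[token_id] = 0.0

assert len(tokens) == vocab_size                 # embedding size and vocab now agree
```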