@@ -3807,7 +3807,7 @@ def _xlmroberta_set_vocab(self) -> None:
38073807 remove_whitespaces = tokenizer .clean_up_tokenization_spaces
38083808 precompiled_charsmap = b64decode (tokenizer_json ["normalizer" ]["precompiled_charsmap" ])
38093809
3810- vocab_size = self .hparams .get (" vocab_size" , tokenizer .vocab_size )
3810+ vocab_size = max ( self .hparams .get (' vocab_size' , 0 ) , tokenizer .vocab_size )
38113811 else :
38123812 sentencepiece_model = model .ModelProto () # pyright: ignore[reportAttributeAccessIssue]
38133813 sentencepiece_model .ParseFromString (open (tokenizer_path , "rb" ).read ())
@@ -3851,24 +3851,24 @@ def _xlmroberta_set_vocab(self) -> None:
38513851 unk_token_id = added_vocab .get (unk_token , tokenizer_json ["model" ].get ("unk_id" , 3 ))
38523852
38533853 for token_id in range (vocab_size ):
3854- piece = tokenizer ._convert_id_to_token (token_id )
3855- text = piece .encode ("utf-8" )
3856- score = tokenizer_json ["model" ]["vocab" ][token_id ][1 ]
3857-
3858- toktype = SentencePieceTokenTypes .NORMAL
3859- if token_id == unk_token_id :
3860- toktype = SentencePieceTokenTypes .UNKNOWN
3861- elif token_id in tokenizer .all_special_ids :
3862- toktype = SentencePieceTokenTypes .CONTROL
3863- elif token_id in added_vocab .values ():
3864- toktype = SentencePieceTokenTypes .USER_DEFINED
3865- # No reliable way to detect this, but jina doesn't have any
3866- # elif tokenizer.IsByte(token_id):
3867- # toktype = SentencePieceTokenTypes.BYTE
3868-
3869- tokens [token_id ] = text
3870- scores [token_id ] = score
3871- toktypes [token_id ] = toktype
3854+ if ( piece : = tokenizer ._convert_id_to_token (token_id )) is not None :
3855+ text = piece .encode ("utf-8" )
3856+ score = tokenizer_json ["model" ]["vocab" ][token_id ][1 ]
3857+
3858+ toktype = SentencePieceTokenTypes .NORMAL
3859+ if token_id == unk_token_id :
3860+ toktype = SentencePieceTokenTypes .UNKNOWN
3861+ elif token_id in tokenizer .all_special_ids :
3862+ toktype = SentencePieceTokenTypes .CONTROL
3863+ elif token_id in added_vocab .values ():
3864+ toktype = SentencePieceTokenTypes .USER_DEFINED
3865+ # No reliable way to detect this, but jina doesn't have any
3866+ # elif tokenizer.IsByte(token_id):
3867+ # toktype = SentencePieceTokenTypes.BYTE
3868+
3869+ tokens [token_id ] = text
3870+ scores [token_id ] = score
3871+ toktypes [token_id ] = toktype
38723872
38733873 if isinstance (tokenizer , SentencePieceProcessor ):
38743874 # realign tokens (see HF tokenizer code)
0 commit comments