@@ -3782,44 +3782,93 @@ def _xlmroberta_set_vocab(self) -> None:
         from sentencepiece import sentencepiece_model_pb2 as model
 
         tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+
+        tokenizer_json = {}
+        tokenizer_config_json = {}
         if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)
 
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)
+
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)
 
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE
 
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+
+            for token_id in range(vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                toktype = SentencePieceTokenTypes.NORMAL
+                if token_id == unk_token_id:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif token_id in tokenizer.all_special_ids:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif token_id in added_vocab.values():
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                # No reliable way to detect this, but jina doesn't have any
+                # elif tokenizer.IsByte(token_id):
+                #     toktype = SentencePieceTokenTypes.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
 
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -3829,15 +3878,16 @@ def _xlmroberta_set_vocab(self) -> None:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]
 
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
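For context, the new tokenizer.json branch above relies on the Hugging Face unigram layout: `model.vocab` is a list of `[piece, score]` pairs indexed by token id, and the normalizer stores the `precompiled_charsmap` as base64. A minimal standalone sketch of that extraction, assuming a local model directory containing a unigram-style tokenizer.json (the directory name and the printed fields are illustrative, not part of the patch):

# Sketch: read the same tokenizer.json fields the converter's new code path uses.
# Assumes a unigram-style HF tokenizer.json; the model directory is hypothetical.
import json
from base64 import b64decode
from pathlib import Path

model_dir = Path("my-xlmroberta-model")  # hypothetical local checkpoint directory

with open(model_dir / "tokenizer.json", "r", encoding="utf-8") as fp:
    tokenizer_json = json.load(fp)

# Unigram vocab entries are [piece, score] pairs, so scores come straight from the JSON.
vocab = tokenizer_json["model"]["vocab"]
print(len(vocab), "vocab entries; first entry:", vocab[0])

# The normalizer carries the charsmap base64-encoded; the converter decodes it to raw bytes.
charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
print(len(charsmap), "bytes of precompiled_charsmap")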