@@ -802,9 +802,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
-        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v3
-            res = "jina-v3"
 
         if res is None:
             logger.warning("\n")
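Note on the hunk above: get_vocab_base_pre() selects res by hashing the tokenization of a fixed probe string, so removing the jina-v3 branch simply lets that checksum fall through to the res is None warning. A minimal sketch of the detection pattern follows; the probe text and hash table are illustrative placeholders, not the exact values in convert_hf_to_gguf.py.

# Sketch of the chkhsh lookup pattern (probe string and hashes are placeholders).
from hashlib import sha256

def detect_pre_tokenizer(tokenizer, known_hashes: dict[str, str]) -> str | None:
    chktxt = "example probe text 123"                  # placeholder probe string
    chktok = tokenizer.encode(chktxt)                  # ids depend on the pre-tokenizer rules
    chkhsh = sha256(str(chktok).encode()).hexdigest()  # fingerprint of that tokenization
    return known_hashes.get(chkhsh)                    # None -> caller logs the warning above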
@@ -3626,44 +3623,93 @@ def _xlmroberta_set_vocab(self) -> None:
         from sentencepiece import sentencepiece_model_pb2 as model
 
         tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+
+        tokenizer_json = {}
+        tokenizer_config_json = {}
         if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)
 
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)
+
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(vocab_size):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)
 
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE
 
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, 3)
+
+            for token_id in range(vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                toktype = SentencePieceTokenTypes.NORMAL
+                if token_id == unk_token_id:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif token_id in tokenizer.all_special_ids:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif token_id in added_vocab.values():
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                # No reliable way to detect this, but jina-embeddings-v3 doesn't have any
+                # elif tokenizer.IsByte(token_id):
+                #     toktype = SentencePieceTokenTypes.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
 
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -3673,15 +3719,16 @@ def _xlmroberta_set_vocab(self) -> None:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]
 
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
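The fallback path added in this hunk reads the same normalizer metadata from tokenizer.json that the protobuf branch gets from sentencepiece.bpe.model. Below is a standalone sketch of that idea, assuming the HF fast-tokenizer JSON layout used by jina-embeddings-v3 (a Unigram model with a base64-encoded precompiled_charsmap); the helper name is illustrative only.

# Standalone sketch (assumed layout: HF fast tokenizer.json with a Unigram model
# and a "precompiled_charsmap" normalizer, as jina-embeddings-v3 ships).
import json
from base64 import b64decode
from pathlib import Path

def load_unigram_metadata(model_dir: Path) -> tuple[bytes, list[tuple[str, float]]]:
    with open(model_dir / "tokenizer.json", encoding="utf-8") as fp:
        tok = json.load(fp)
    # tokenizer.json stores the charsmap base64-encoded; the protobuf stores raw bytes
    charsmap = b64decode(tok["normalizer"]["precompiled_charsmap"])
    # each vocab entry is a [piece, score] pair, mirroring SentencePiece's GetScore(id)
    vocab = [(piece, float(score)) for piece, score in tok["model"]["vocab"]]
    return charsmap, vocab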
@@ -3841,15 +3888,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
             self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
 
         super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
-
-        self._tokenizer_is_xlmroberta = False if self.model_arch == gguf.MODEL_ARCH.JINA_BERT_V3 else True
-        if self._tokenizer_is_xlmroberta:
-            self._xlmroberta_tokenizer_init()
+        self._xlmroberta_tokenizer_init()
 
     def set_vocab(self):
-        if self._tokenizer_is_xlmroberta:
-            return self._xlmroberta_set_vocab()
-        return super().set_vocab()
+        self._xlmroberta_set_vocab()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
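For the modify_tensors comment above: checkpoints exported in XLM-RoBERTa style prefix their tensor names with "roberta.", which has to be stripped before the usual BERT tensor mapping applies. A hypothetical helper (not part of the file) illustrating just that step:

# Hypothetical illustration of the prefix handling referenced in the comment above.
def strip_roberta_prefix(name: str) -> str:
    prefix = "roberta."
    return name[len(prefix):] if name.startswith(prefix) else name

assert strip_roberta_prefix("roberta.embeddings.word_embeddings.weight") == "embeddings.word_embeddings.weight"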