@@ -695,8 +695,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
             # ref: https://huggingface.co/Xenova/gpt-4o
             res = "gpt-4o"
-        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
-            res = "bert"

         if res is None:
             logger.warning("\n")
@@ -3088,6 +3086,97 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         return [(self.map_tensor_name(name), data_torch)]

+    def _xlmroberta_tokenizer_init(self) -> None:
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
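+    # builds the vocab from the XLM-RoBERTa SentencePiece (Unigram) model shipped as
+    # sentencepiece.bpe.model, realigns the special tokens to the HF id layout, and
+    # writes the result to GGUF as a Unigram ("t5"-style) tokenizer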
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
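+        # HF's XLMRobertaTokenizer reserves ids 0-3 for <s>/<pad>/</s>/<unk> and shifts the
+        # SentencePiece pieces by one, so the SP copies of <unk>/<s>/</s> are dropped here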
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+

 @Model.register("RobertaModel")
 class RobertaModel(BertModel):
@@ -3154,6 +3243,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                          eager, metadata_override, model_name, split_max_tensors,
                          split_max_size, dry_run, small_first_shard, hparams)

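+        # some checkpoints ship an XLM-RoBERTa-style (Unigram/SentencePiece) tokenizer
+        # instead of WordPiece; detect it up front so set_vocab and the
+        # position-embedding offset take the matching path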
+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
         # the HF config claims n_ctx=8192, but it uses RoPE scaling
         self.hparams["n_ctx"] = 2048

@@ -3181,6 +3274,21 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         assert self.hparams["rotary_emb_interleaved"] is False
         assert self.hparams["rotary_emb_scale_base"] is None

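+    # tokenizer.json's "model.type" distinguishes the two tokenizers: "Unigram" means an
+    # XLM-RoBERTa-style SentencePiece model, "WordPiece" the plain BERT tokenizer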
+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
     def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
         # If the tensor is an experts bias tensor, skip it by returning an empty list.
         if "mlp.experts.bias" in name:
@@ -3212,96 +3320,10 @@ class XLMRobertaModel(BertModel):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self._xlmroberta_tokenizer_init()

-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
+    def set_vocab(self) -> None:
+        self._xlmroberta_set_vocab()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix