@@ -3365,6 +3365,97 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
+    def _xlmroberta_tokenizer_init(self) -> None:
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
 
 @ModelBase.register("RobertaModel")
 class RobertaModel(BertModel):
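
Note on the "realign tokens" step above: a SentencePiece Unigram model reserves ids 0-2 for <unk>, <s>, </s>, while the HF/fairseq XLM-R vocabulary expects <s>, <pad>, </s>, <unk> at ids 0-3 and shifts every remaining piece up by one; the final slot is dropped so the total vocab size is unchanged. A minimal, self-contained sketch of that list surgery on a toy vocab (the piece strings are made up, not taken from any real model):

    # Toy illustration of the id realignment performed in _xlmroberta_set_vocab.
    sp_pieces = ["<unk>", "<s>", "</s>", "▁Hello", "▁world", "!", "[PAD6]"]  # SentencePiece ids 0..6

    # Drop SentencePiece's specials (ids 0-2) and the trailing slot, then
    # prepend the four specials in the order the HF/fairseq vocab expects.
    realigned = ["<s>", "<pad>", "</s>", "<unk>"] + sp_pieces[3:-1]

    assert len(realigned) == len(sp_pieces)  # vocab size is preserved
    assert realigned[4] == sp_pieces[3]      # each piece id i (i >= 3) moves to i + 1
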
@@ -3423,6 +3514,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
 
         super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
 
+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
         # the HF config claims n_ctx=8192, but it uses RoPE scaling
         self.hparams["n_ctx"] = 2048
 
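
The `_position_offset` set up in `_xlmroberta_tokenizer_init` reflects how RoBERTa/XLM-R models index positions: position ids start at `pad_token_id + 1`, so the first `1 + pad_token_id` rows of the learned position-embedding matrix are never addressed and `max_position_embeddings` is reduced accordingly. A rough sketch of the corresponding tensor trim, with made-up shapes (the converter's actual tensor handling happens in `modify_tensors`):

    import torch

    # Hypothetical shapes: an XLM-R-style config with pad_token_id=1 and
    # max_position_embeddings=514 (512 usable positions + the 2-row offset).
    pad_token_id = 1
    position_offset = 1 + pad_token_id

    position_embd = torch.randn(514, 768)         # e.g. an HF-style position-embedding weight
    trimmed = position_embd[position_offset:, :]  # drop the rows that are never used

    assert trimmed.shape[0] == 514 - position_offset  # matches the adjusted max_position_embeddings
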
@@ -3442,6 +3537,11 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         assert self.hparams["rotary_emb_interleaved"] is False
         assert self.hparams["rotary_emb_scale_base"] is None
 
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
     def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
         # If the tensor is an experts bias tensor, skip it by returning an empty list.
         if "mlp.experts.bias" in name:
@@ -3466,103 +3566,27 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_count(self.hparams["num_experts"])
         self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
 
+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
 
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
+        self._xlmroberta_tokenizer_init()
 
     def set_vocab(self):
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
+        self._xlmroberta_set_vocab()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
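
For reference, the `_is_tokenizer_xlmroberta` helper added above keys off the `model.type` field of HF's `tokenizer.json`: `Unigram` (SentencePiece/XLM-R style) routes `set_vocab` to `_xlmroberta_set_vocab`, while `WordPiece` keeps the regular BERT vocab path. Abbreviated, made-up fragments showing the two shapes of that field (real files also carry the full vocab, normalizer, and pre-tokenizer sections):

    import json

    unigram_like = json.loads('{"model": {"type": "Unigram", "unk_id": 3}}')
    wordpiece_like = json.loads('{"model": {"type": "WordPiece", "unk_token": "[UNK]"}}')

    assert unigram_like["model"]["type"] == "Unigram"      # -> _is_tokenizer_xlmroberta() returns True
    assert wordpiece_like["model"]["type"] == "WordPiece"  # -> returns False, default BERT vocab path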