@@ -663,7 +663,11 @@ def get_vocab_base_pre(self, tokenizer) -> str:
663663 res = "minerva-7b"
664664 if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65" :
665665 # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
666- res = "roberta-bpe"
666+ # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
667+ # adds the cls/sep tokens as bos/eos. This is handled as a
668+ # post-processor in tokenizers, so the chkhsh is different, but
669+ # it still maps to gpt-2 internally.
670+ res = "gpt-2"
667671
668672 if res is None :
669673 logger .warning ("\n " )
@@ -2544,7 +2548,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
25442548 return [(self .map_tensor_name (name ), data_torch )]
25452549
25462550
2547- @Model .register ("BertModel" , "CamembertModel" , "RobertaModel" )
2551+ @Model .register ("BertModel" , "CamembertModel" )
25482552class BertModel (Model ):
25492553 model_arch = gguf .MODEL_ARCH .BERT
25502554
@@ -2617,6 +2621,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26172621 return [(self .map_tensor_name (name ), data_torch )]
26182622
26192623
@Model.register("RobertaModel")
class RobertaModel(BertModel):
    """Converter for ``RobertaModel`` checkpoints.

    Everything is inherited from ``BertModel`` except vocabulary export,
    which prefers a BPE ``tokenizer.json`` when the model directory has one.
    """

    model_arch = gguf.MODEL_ARCH.BERT

    def set_vocab(self):
        """Write the vocabulary, using the GPT-2 style BPE path when possible.

        If ``tokenizer.json`` is missing from the model directory, the call is
        delegated to the parent implementation and its result is returned.
        Otherwise the GPT-2 vocabulary is written, both the add-BOS and
        add-EOS flags are set to ``True``, and the token type count is
        recorded; nothing is returned in that case.
        """
        tokenizer_json = self.dir_model / "tokenizer.json"

        # No BPE tokenizer file: fall back to the parent's vocab handling.
        if not tokenizer_json.exists():
            return super().set_vocab()

        self._set_vocab_gpt2()

        # Ask for both BOS and EOS tokens to be added around the input.
        for enable_flag in (
            self.gguf_writer.add_add_bos_token,
            self.gguf_writer.add_add_eos_token,
        ):
            enable_flag(True)

        # The token type count is needed to validate the size of the
        # token_type embeddings ("Sequence A" or "Sequence B"), even though
        # all zeros are currently passed to those embeddings.
        n_token_types = self.hparams.get("type_vocab_size", 1)
        self.gguf_writer.add_token_type_count(n_token_types)
2643+
2644+
26202645@Model .register ("NomicBertModel" )
26212646class NomicBertModel (BertModel ):
26222647 model_arch = gguf .MODEL_ARCH .NOMIC_BERT
0 commit comments