Skip to content

Commit a2e03b8

Browse files
committed
fix: Use gpt2 tokenizer for roberta and add eos/bos tokens
Branch: RobertaTokenizer Signed-off-by: Gabe Goodhart <[email protected]>
1 parent a76c56f commit a2e03b8

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

convert_hf_to_gguf.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,11 @@ def get_vocab_base_pre(self, tokenizer) -> str:
663663
res = "minerva-7b"
664664
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
665665
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
666-
res = "roberta-bpe"
666+
# NOTE: The Roberta tokenizer is the same as GPT-2, but it always
667+
# adds the cls/sep tokens as bos/eos. This is handled as a
668+
# post-processor in tokenizers, so the chkhsh is different, but
669+
# it still maps to gpt-2 internally.
670+
res = "gpt-2"
667671

668672
if res is None:
669673
logger.warning("\n")
@@ -2544,7 +2548,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
25442548
return [(self.map_tensor_name(name), data_torch)]
25452549

25462550

2547-
@Model.register("BertModel", "CamembertModel", "RobertaModel")
2551+
@Model.register("BertModel", "CamembertModel")
25482552
class BertModel(Model):
25492553
model_arch = gguf.MODEL_ARCH.BERT
25502554

@@ -2617,6 +2621,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26172621
return [(self.map_tensor_name(name), data_torch)]
26182622

26192623

2624+
@Model.register("RobertaModel")
class RobertaModel(BertModel):
    # Roberta is architecturally identical to BERT; only the vocab
    # handling differs, so we inherit everything else from BertModel.
    model_arch = gguf.MODEL_ARCH.BERT

    def set_vocab(self):
        """Support BPE tokenizers for roberta models"""
        tokenizer_json = self.dir_model / "tokenizer.json"
        if not tokenizer_json.exists():
            # No tokenizers-format file: defer to the BERT (WordPiece) path.
            return super().set_vocab()

        # Roberta ships a GPT-2-style BPE vocab; its tokenizers
        # post-processor always wraps input with cls/sep acting as
        # bos/eos, so mirror that by enabling both flags.
        self._set_vocab_gpt2()
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
        # "Sequence A" or "Sequence B"
        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
26202645
@Model.register("NomicBertModel")
26212646
class NomicBertModel(BertModel):
26222647
model_arch = gguf.MODEL_ARCH.NOMIC_BERT

0 commit comments

Comments
 (0)