
Commit 1274c8c

fix vocab parsing with only tokenizer.json
1 parent 4ac1380 commit 1274c8c
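
This change lets the XLM-RoBERTa vocab conversion fall back to the Hugging Face tokenizer.json / tokenizer_config.json pair when a checkpoint ships no sentencepiece.bpe.model (as jina-embeddings-v3 does). As a reading aid, here is a condensed, standalone sketch of that fallback; the helper name and directory argument are illustrative, and the real logic lives in _xlmroberta_set_vocab in the diff below.

import json
from base64 import b64decode
from pathlib import Path

def load_unigram_vocab_fallback(dir_model: Path):
    # Illustrative helper: read what _xlmroberta_set_vocab needs from tokenizer.json
    # when sentencepiece.bpe.model is absent.
    tokenizer_json = json.loads((dir_model / "tokenizer.json").read_text(encoding="utf-8"))
    # the normalizer's charsmap is stored base64-encoded in tokenizer.json
    precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
    # a Unigram model lists [piece, score] pairs in token-id order
    pieces = [piece for piece, _score in tokenizer_json["model"]["vocab"]]
    scores = [score for _piece, score in tokenizer_json["model"]["vocab"]]
    return precompiled_charsmap, pieces, scores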

File tree

1 file changed, +87 -45 lines


convert_hf_to_gguf.py

Lines changed: 87 additions & 45 deletions
@@ -802,9 +802,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
-        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v3
-            res = "jina-v3"
 
         if res is None:
             logger.warning("\n")
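
The hunk above drops the jina-v3 entry from the pre-tokenizer hash table, which lines up with the last hunk below: JINA_BERT_V3 now always takes the XLM-RoBERTa vocab path, so get_vocab_base_pre presumably no longer needs to recognize this checkpoint. For context, the chkhsh fingerprints are, roughly, a SHA-256 over the stringified token ids of a fixed probe text (hedged sketch only; the probe text itself is omitted here):

from hashlib import sha256

def vocab_hash(tokenizer, chktxt: str) -> str:
    # roughly how get_vocab_base_pre fingerprints a tokenizer
    return sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()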
@@ -3626,44 +3623,93 @@ def _xlmroberta_set_vocab(self) -> None:
         from sentencepiece import sentencepiece_model_pb2 as model
 
         tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+
+        tokenizer_json = {}
+        tokenizer_config_json = {}
         if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)
 
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)
+
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(vocab_size):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)
 
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE
 
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, 3)
+
+            for token_id in range(vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                toktype = SentencePieceTokenTypes.NORMAL
+                if token_id == unk_token_id:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif token_id in tokenizer.all_special_ids:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif token_id in added_vocab.values():
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                # No reliable way to detect this, but jina-embeddings-v3 doesn't have any
+                # elif tokenizer.IsByte(token_id):
+                #     toktype = SentencePieceTokenTypes.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
 
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
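
For readers unfamiliar with the tokenizer.json layout the new branch reads: the normalizer carries a base64-encoded precompiled charsmap, and a Unigram model stores its vocabulary as [piece, score] pairs in token-id order. An illustrative excerpt in Python form (values are invented; real files list the full vocabulary):

# Illustrative shape of the fields read above, not a real file.
tokenizer_json = {
    "normalizer": {"type": "Precompiled", "precompiled_charsmap": "ALQCAACEAAAA..."},
    "model": {
        "type": "Unigram",
        "vocab": [
            ["<s>", 0.0],
            ["<pad>", 0.0],
            ["</s>", 0.0],
            ["<unk>", 0.0],
            ["\u2581the", -3.21],  # [piece, log-probability score]
        ],
    },
}
score = tokenizer_json["model"]["vocab"][4][1]  # -3.21, indexed the same way as in the diff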
@@ -3673,15 +3719,16 @@ def _xlmroberta_set_vocab(self) -> None:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]
 
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
@@ -3841,15 +3888,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
             self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
 
         super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
-
-        self._tokenizer_is_xlmroberta = False if self.model_arch == gguf.MODEL_ARCH.JINA_BERT_V3 else True
-        if self._tokenizer_is_xlmroberta:
-            self._xlmroberta_tokenizer_init()
+        self._xlmroberta_tokenizer_init()
 
     def set_vocab(self):
-        if self._tokenizer_is_xlmroberta:
-            return self._xlmroberta_set_vocab()
-        return super().set_vocab()
+        self._xlmroberta_set_vocab()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
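
To see which branch a local checkpoint would exercise after this change, a quick, illustrative check (the directory name is hypothetical); conversion itself remains the usual convert_hf_to_gguf.py invocation on the model directory.

from pathlib import Path

dir_model = Path("./jina-embeddings-v3")  # hypothetical local checkout
if (dir_model / "sentencepiece.bpe.model").is_file():
    print("sentencepiece.bpe.model present: original SentencePiece path")
elif (dir_model / "tokenizer.json").is_file():
    print("only tokenizer.json present: new Hugging Face tokenizer fallback")
else:
    print("no supported tokenizer files found")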
