Commit 490a13f

move tokenizer changes to sub class
1 parent c71543c commit 490a13f

File tree

1 file changed: +42 -5 lines changed


convert_hf_to_gguf.py

Lines changed: 42 additions & 5 deletions
@@ -608,13 +608,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         added_tokens_decoder = tokenizer.added_tokens_decoder

@@ -3212,7 +3211,45 @@ def set_vocab(self):
         self._set_vocab_gpt2()
 
     def _set_vocab_interns1(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
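
Note: the diff reverts the getattr fallback in the shared get_vocab_base() and keeps it only in the model-specific _set_vocab_interns1(), which also loads its tokenizer with trust_remote_code=True. A minimal sketch of what that fallback handles, assuming a tokenizer that only implements get_vocab() and has no vocab attribute (both stand-in classes below are hypothetical, not from the commit):

# Sketch only: illustrates getattr(tokenizer, 'vocab', tokenizer.get_vocab())
class SlowStyleTokenizer:
    # exposes a vocab attribute, like most transformers tokenizers
    vocab = {"hello": 0, "world": 1}

    def get_vocab(self):
        return dict(self.vocab)

class RemoteCodeTokenizer:
    # only implements get_vocab(); no vocab attribute
    def get_vocab(self):
        return {"hello": 0, "world": 1}

for tok in (SlowStyleTokenizer(), RemoteCodeTokenizer()):
    # same expression as in _set_vocab_interns1() above
    vocab = getattr(tok, 'vocab', tok.get_vocab())
    print(type(tok).__name__, len(vocab), max(vocab.values()))

Since the default argument of getattr() is evaluated eagerly, get_vocab() is always called; that is harmless here because every tokenizer provides it.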
