
Commit 4ce6630

update tokenizer_model
1 parent: df4580e


convert_hf_to_gguf.py

Lines changed: 7 additions & 16 deletions
@@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []

         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size

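Note on this hunk: `trust_remote_code=True` lets `transformers` execute custom tokenizer code bundled inside the model repo. The standard-format GLM-4 `-hf` repos ship a plain `tokenizer.json`, so the shared `get_vocab_base` path no longer needs the flag. A minimal sketch of what the loader now does, using the repo referenced later in this diff:

```python
from transformers import AutoTokenizer

# Without trust_remote_code, transformers only loads tokenizer formats it
# supports natively (here: the repo's tokenizer.json); repos that still
# require custom Python raise an error instead of silently executing it.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-hf")
print(len(tokenizer.vocab))  # the vocab size the converter checks against hparams
```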
@@ -738,9 +738,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"

         if res is None:
             logger.warning("\n")
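This hunk deletes an exact duplicate: both `if` blocks compared `chkhsh` against the same hash and assigned the same `res`, so the second block was dead code. For context, `get_vocab_base_pre` derives `chkhsh` by hashing the token ids the tokenizer produces for a fixed probe string, roughly as follows (`tokenizer` and the probe string `chktxt` are defined elsewhere in `convert_hf_to_gguf.py`):

```python
from hashlib import sha256

# chktxt mixes whitespace runs, emoji, digits, and mixed-case words so that
# tokenizers with different pre-tokenization rules hash to different values.
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
    # ref: https://huggingface.co/THUDM/glm-4-9b-hf
    res = "glm4"
```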
@@ -5025,7 +5022,7 @@ def set_vocab(self):

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size",hparams.get("vocab_size"))
+        vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size"))
         assert max(tokenizer.get_vocab().values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)
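Besides adding the missing space after the comma, the line encodes a fallback worth spelling out: some GLM checkpoints pad the embedding matrix past the real vocabulary, so the converter prefers `padded_vocab_size` and falls back to `vocab_size` when the padding key is absent. A small illustration with assumed values:

```python
# Illustrative hparams; the numbers are assumptions, not from a real config.
hparams = {"padded_vocab_size": 151552, "vocab_size": 151329}
vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size"))  # -> 151552

hparams = {"vocab_size": 151329}  # no padding key in config.json
vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size"))  # -> 151329
```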
@@ -5052,16 +5049,12 @@ def set_vocab(self):

                 if added_tokens_decoder[i].special or self.does_token_look_special(token):
                     toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    # NOTE: this was added for Gemma.
-                    # Encoding and decoding the tokens above isn't sufficient for this case.
-                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+
             else:
                 toktypes.append(gguf.TokenType.NORMAL)
             tokens.append(token)

-        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
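Two related fixes here. The deleted branch was Gemma-specific: it rewrote the SentencePiece word-boundary marker into plain spaces (`b"\xe2\x96\x81"` is the UTF-8 encoding of `▁`, U+2581), which only makes sense for SentencePiece vocabularies. Consistently, the tokenizer model recorded in the GGUF metadata changes from `"llama"` (SentencePiece) to `"gpt2"` (byte-level BPE), matching GLM-4's actual tokenizer; llama.cpp picks its tokenizer implementation from this field at load time. A sketch of the metadata calls, assuming the `gguf` Python package the converter uses (path and arch are illustrative):

```python
import gguf

writer = gguf.GGUFWriter("glm4.gguf", arch="glm4")
writer.add_tokenizer_model("gpt2")  # byte-level BPE; "llama" would declare SentencePiece
writer.add_tokenizer_pre("glm4")    # pre-tokenizer id resolved by get_vocab_base_pre
```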
@@ -5076,11 +5069,9 @@ def set_vocab(self):
         #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens.
         # Currently, llama.cpp only supports a one-to-one mapping.
         # This can lead to an issue where the model fails to terminate properly.
-        # I'm still unclear about how llama.cpp handles special_token and what the exact call chain is!
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"])
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # You can see a temporary workaround here. https://github.com/ggml-org/llama.cpp/issues/9606
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
         # this one is usually not in config.json anyway
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
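The cleanup drops the three stacked `eos` assignments: `SpecialVocab` keeps a single id per special-token type, so only one of those calls could ever take effect. The surviving calls state the intent directly, and the one-to-many limitation noted in the TODO is tracked in the linked issue, with `eot` acting as a second stop id in practice. What the remaining calls amount to, with assumed token ids:

```python
# Ids are illustrative; real values come from tokenizer.get_added_vocab().
added = {"<|endoftext|>": 151329, "<|user|>": 151336}

special_vocab._set_special_token("eos", added["<|endoftext|>"])  # -> tokenizer.ggml.eos_token_id
special_vocab._set_special_token("eot", added["<|user|>"])       # -> tokenizer.ggml.eot_token_id
```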
