Commit df4580e

Fix ChatGLMModel for glm-4-9b: "cannot find tokenizer merges in model file"
1 parent 0019279 commit df4580e

File tree: 1 file changed

convert_hf_to_gguf.py (53 additions, 6 deletions)
@@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
@@ -738,6 +738,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
 
         if res is None:
             logger.warning("\n")
@@ -5022,16 +5025,60 @@ def set_vocab(self):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
+        vocab_size = hparams.get("padded_vocab_size",hparams.get("vocab_size"))
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.gguf_writer.add_tokenizer_model("gpt2")
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+
+        special_vocab=gguf.SpecialVocab(
+            self.dir_model,
+            load_merges=False,
+            n_vocab=vocab_size
+        )
         # only add special tokens when they were not already loaded from config.json
+
+        #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens.
+        # Currently, llama.cpp only supports a one-to-one mapping.
+        # This can lead to an issue where the model fails to terminate properly.
+        # I'm still unclear about how llama.cpp handles special_token and what the exact call chain is!
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
         # this one is usually not in config.json anyway
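
For readers following the new set_vocab logic above, here is a minimal standalone sketch of the padded-vocab reconstruction. It is deliberately simplified (it skips the token normalization and USER_DEFINED handling shown in the hunk), MODEL_DIR is a hypothetical local path to a glm-4-9b checkout, and plain strings stand in for the gguf.TokenType constants so the snippet runs outside the converter:

    from transformers import AutoTokenizer

    MODEL_DIR = "./glm-4-9b"  # hypothetical local checkout of THUDM/glm-4-9b

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
    vocab = tokenizer.get_vocab()
    added_vocab = tokenizer.get_added_vocab()

    # Stand-in for hparams["padded_vocab_size"]; the converter reads it from config.json.
    vocab_size = max(vocab.values()) + 1

    reverse_vocab = {id_: tok for tok, id_ in vocab.items()}

    tokens: list[str] = []
    toktypes: list[str] = []  # the converter stores gguf.TokenType values instead of strings
    for i in range(vocab_size):
        if i not in reverse_vocab:
            tokens.append(f"[PAD{i}]")       # fill id-space holes so the GGUF token list is dense
            toktypes.append("UNUSED")
        elif reverse_vocab[i] in added_vocab:
            tokens.append(reverse_vocab[i])  # added/special tokens such as <|user|> or <|observation|>
            toktypes.append("CONTROL")
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append("NORMAL")

    print(f"{len(tokens)} tokens ({toktypes.count('CONTROL')} control, {toktypes.count('UNUSED')} padding)")

With the tokenizer model switched from "gpt2" to "llama" and SpecialVocab built with load_merges=False, the resulting GGUF should no longer depend on BPE merges, which appears to be the source of the "cannot find tokenizer merges in model file" error this commit addresses.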
@@ -5045,7 +5092,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")))
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
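
As a quick illustration of the .get() fallback introduced in the hunk above, the toy snippet below (with made-up hparams dicts) shows how the block count now resolves for either config layout, whereas the old default expression raised a KeyError when "num_hidden_layers" was absent:

    # Hypothetical config fragments: legacy ChatGLM checkpoints expose "num_layers",
    # while HF-style glm-4 configs expose "num_hidden_layers".
    legacy_hparams = {"num_layers": 40}
    hf_hparams = {"num_hidden_layers": 40}

    for hparams in (legacy_hparams, hf_hparams):
        block_count = hparams.get("num_layers", hparams.get("num_hidden_layers"))
        print(block_count)  # prints 40 for both layouts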
