
Commit 69f0ae5

glm 4.5 set eos/eog/eot token to <|user|>
1 parent: c56a513

1 file changed: +6, −26 lines

1 file changed

+6
-26
lines changed

convert_hf_to_gguf.py

@@ -6603,39 +6603,19 @@ def set_vocab(self):
         self.gguf_writer.add_token_types(toktypes)
 
         # Special tokens
-        # BOS should be [gMASK] (151331), EOS should be <|endoftext|> (151329) as per tokenizer analysis
-        special_vocab._set_special_token(
-            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]  # 151329 - correct EOS token
-        )
-        special_vocab._set_special_token(
-            "eot", tokenizer.get_added_vocab()["<|endoftext|>"]  # 151329 - same as EOS
-        )
-        special_vocab._set_special_token(
-            "unk", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
-        special_vocab._set_special_token(
-            "bos", tokenizer.get_added_vocab()["[gMASK]"]  # 151331
-        )
+        # Note: Using <|endoftext|> (151329) for eos and eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - end of
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - same as EOS
+        special_vocab._set_special_token("eog", tokenizer.get_added_vocab()["<|user|>"])  # 151336 - same as EOS
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
         special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
 
         if "<sop>" in tokenizer.get_added_vocab():
             special_vocab._set_special_token("sop", tokenizer.get_added_vocab()["<sop>"])  # 151333
         if "<eop>" in tokenizer.get_added_vocab():
             special_vocab._set_special_token("eop", tokenizer.get_added_vocab()["<eop>"])  # 151334
 
-        # TODO: clean up once decided on an approach to think and /nothink
-        #
-        # Previously:
-        # if "/nothink" in tokenizer.get_added_vocab():
-        #     special_vocab._set_special_token("nothink", tokenizer.get_added_vocab()["/nothink"])  # 151360
-        # Note: <think> and </think> are regular tokens (special=false in official config), not special tokens
-        #
-        # Latest thinking is:
-        # NOTE: /nothink token exists but causes generation issues as mentioned in
-        # https://huggingface.co/zai-org/GLM-4.5/discussions/9
-        # "it is a very special token. Even as input, it will be encoded into a special token, causing generation issues."
-        # Therefore we do NOT add it to avoid generation problems
-
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
