Skip to content

Commit f7a04ad

Browse files
committed
Update convert_hf_to_gguf.py
1 parent 277a05a commit f7a04ad

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5605,8 +5605,17 @@ def set_vocab(self):
56055605
print(f'tokenizer.vocab_size = {tokenizer.vocab_size}', flush = True)
56065606
print("====================================", flush = True)
56075607

5608-
assert tokenizer.vocab_size == vocab_size
5608+
# Kimi-K2's tokenizer lists 2 extra reserved tokens whose ids are >= vocab_size; drop them so the vocab size check below passes
56095609
special_tokens = tokenizer.special_tokens
5610+
tokenizer_vocab_size = tokenizer.vocab_size
5611+
bad_tokens = ["<|reserved_token_163840|>", "<|reserved_token_163841|>"]
5612+
for bad_token in bad_tokens:
5613+
if bad_token in special_tokens and special_tokens[bad_token] >= vocab_size:
5614+
print(f"Removing bad reserved token = {bad_token}")
5615+
special_tokens.pop(bad_token)
5616+
tokenizer_vocab_size -= 1
5617+
5618+
assert tokenizer_vocab_size == vocab_size
56105619
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
56115620
tokens: list[str] = []
56125621
toktypes: list[int] = []

0 commit comments

Comments
 (0)