
Commit 38d5910

Kimi-K2 conversion
1 parent c31e606 commit 38d5910

File tree

1 file changed: +47 / -1 lines changed


convert_hf_to_gguf.py

Lines changed: 47 additions & 1 deletion
@@ -5563,7 +5563,53 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        if self.hparams["vocab_size"] == 163840:  # Kimi-K2 model
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+            tokpre = "kimi-k2"  # TODO: add identifier hash
+
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(tokenizer.vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
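A note on the merges loop in the diff above: instead of reading a merges.txt, the converter reconstructs the BPE merge list from the tokenizer's byte-pair rank table. Below is a minimal, self-contained sketch of that idea; toy_ranks and bpe_split are illustrative stand-ins for tokenizer.model._mergeable_ranks and QwenModel.bpe, not the converter's actual objects.

def bpe_split(ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    """Re-tokenize `token` using only merges whose rank is strictly below `max_rank`."""
    parts = [bytes([b]) for b in token]
    while True:
        best_idx, best_rank = None, None
        for i in range(len(parts) - 1):
            rank = ranks.get(parts[i] + parts[i + 1])
            if rank is not None and rank < max_rank and (best_rank is None or rank < best_rank):
                best_idx, best_rank = i, rank
        if best_idx is None:
            return parts
        parts = parts[:best_idx] + [parts[best_idx] + parts[best_idx + 1]] + parts[best_idx + 2:]

# Toy rank table: single bytes first, then the merges that built longer tokens.
toy_ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

merges = []
for token, rank in toy_ranks.items():
    if len(token) == 1:
        continue                      # single bytes are not produced by a merge
    merged = bpe_split(toy_ranks, token, max_rank=rank)
    if len(merged) == 2:              # a clean two-way split is one merge rule
        merges.append((merged[0], merged[1]))

print(merges)  # [(b'a', b'b'), (b'ab', b'c')]

Each multi-byte token is re-split using only merges of strictly lower rank; when exactly two pieces remain, they form the merge rule that produces that token.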
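Similarly, the token-list loop fills any ids missing from the merged vocab with [PADi] placeholders so the id range stays dense, and marks special-token ids as CONTROL. A small sketch of that pattern with a toy vocabulary; plain strings stand in for the real gguf.TokenType constants.

vocab = {"hello": 0, "world": 1, "##ing": 3}   # BPE vocab (id 2 intentionally missing)
special_tokens = {"<|endoftext|>": 4}          # special-token string -> id
vocab_size = 6                                 # declared size; may exceed the used ids

reverse_vocab = {id_: tok for tok, id_ in {**vocab, **special_tokens}.items()}

tokens: list[str] = []
toktypes: list[str] = []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")   # fill the hole so ids stay dense
        toktypes.append("UNUSED")
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append("CONTROL" if i in special_tokens.values() else "NORMAL")

print(list(zip(tokens, toktypes)))
# [('hello', 'NORMAL'), ('world', 'NORMAL'), ('[PAD2]', 'UNUSED'),
#  ('##ing', 'NORMAL'), ('<|endoftext|>', 'CONTROL'), ('[PAD5]', 'UNUSED')]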
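On the "TODO: add identifier hash" comment: convert_hf_to_gguf.py normally identifies a pre-tokenizer by encoding a fixed check string and hashing the resulting token ids, then matching that digest in get_vocab_base_pre. A rough sketch of the mechanism; PROBE_TEXT and FakeTokenizer are placeholders, not the script's actual check text or API.

from hashlib import sha256

PROBE_TEXT = "Hello, world! 123 \t\n emoji: \U0001F680"  # placeholder probe text

def vocab_pre_hash(tokenizer, probe_text: str = PROBE_TEXT) -> str:
    # Hash the id sequence the tokenizer produces for the probe text.
    ids = tokenizer.encode(probe_text)
    return sha256(str(ids).encode()).hexdigest()

class FakeTokenizer:
    """Stand-in that 'encodes' text as its UTF-8 byte values."""
    def encode(self, text: str) -> list[int]:
        return list(text.encode("utf-8"))

print(vocab_pre_hash(FakeTokenizer()))  # deterministic digest for this probe text

Once the real digest for the Kimi-K2 tokenizer is known, it could be added to that lookup so the "kimi-k2" pre-tokenizer name is resolved automatically instead of being hard-coded.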