
Commit 52642ba

Update convert_hf_to_gguf.py

Enable GGUF conversion for Moonlight-16B-A3B with TikTokenTokenizer:

- Modified get_vocab_base to handle TikTokenTokenizer using vocab_size and decode([i]), bypassing the .vocab requirement.
- Added trust_remote_code=True to AutoTokenizer.from_pretrained for custom tokenizer loading.
- Added pre-tokenizer hash "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890" to get_vocab_base_pre as "moonlight-a3b".
- Tested successfully with a local Moonlight-16B-A3B model.
1 parent 7ad0779 commit 52642ba
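
For context on the approach: TikToken-style tokenizers expose vocab_size but no .vocab mapping from token strings to IDs, so the vocabulary has to be rebuilt by decoding each ID individually. A minimal standalone sketch of that idea (not the patch itself; the model ID is the Moonlight repo this commit targets, and error handling is omitted):

from transformers import AutoTokenizer

# trust_remote_code=True is needed because Moonlight ships a custom tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(
    "moonshotai/Moonlight-16B-A3B", trust_remote_code=True
)

if hasattr(tokenizer, "vocab"):
    # Standard HF path: a direct token -> id mapping exists.
    id_to_token = {i: t for t, i in tokenizer.vocab.items()}
else:
    # TikToken-style path: recover each token string by decoding its ID.
    id_to_token = {i: tokenizer.decode([i]) for i in range(tokenizer.vocab_size)}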

File tree

1 file changed (+60, -48 lines)


convert_hf_to_gguf.py

Lines changed: 60 additions & 48 deletions
@@ -515,45 +515,65 @@ def does_token_look_special(self, token: str | bytes) -> bool:
 
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        tokens: list[str] = []
+        toktypes: list[int] = []
 
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
+        if hasattr(tokenizer, "vocab"):
+            # Standard Hugging Face tokenizer (e.g., GPT-2, BERT)
+            vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+            reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
+            assert max(tokenizer.vocab.values()) < vocab_size, "Vocab IDs exceed vocab_size"
+            added_vocab = tokenizer.get_added_vocab()
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    if token in added_vocab:
+                        if hasattr(tokenizer, "added_tokens_decoder") and i in tokenizer.added_tokens_decoder:
+                            if not tokenizer.added_tokens_decoder[i].normalized:
+                                previous_token = token
+                                token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                                if previous_token != token:
+                                    logger.info(f"{repr(previous_token)} normalized to {repr(token)}")
+                            if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                                toktypes.append(gguf.TokenType.CONTROL)
+                            else:
+                                token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # Normalize spaces
+                                toktypes.append(gguf.TokenType.USER_DEFINED)
+                        else:
+                            toktypes.append(gguf.TokenType.USER_DEFINED)
                     else:
-                        # NOTE: this was added for Gemma.
-                        # Encoding and decoding the tokens above isn't sufficient for this case.
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                        toktypes.append(gguf.TokenType.NORMAL)
+                    tokens.append(token)
+
+        elif "TikTokenTokenizer" in type(tokenizer).__name__:
+            # TikTokenTokenizer case
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)  # Use the vocab_size attribute
+            tokens = [tokenizer.decode([i]) for i in range(vocab_size)]  # Decode token IDs to strings
+
+            # Handle special tokens
+            special_tokens = tokenizer.special_tokens_map
+            special_token_set = {v for val in special_tokens.values() for v in (val if isinstance(val, list) else [val])}
+
+            for i in range(vocab_size):
+                token = tokens[i]
+                if token in special_token_set or self.does_token_look_special(token):
+                    toktypes.append(gguf.TokenType.CONTROL)
+                elif token.strip() == "" or token.startswith("[PAD") or token.startswith("<|PAD"):
+                    toktypes.append(gguf.TokenType.UNUSED)
                 else:
                     toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
 
+        else:
+            raise ValueError(f"Unsupported tokenizer type: {type(tokenizer).__name__}")
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
@@ -579,9 +599,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
@@ -591,12 +608,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -624,9 +641,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
         if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
             # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
             res = "jina-v1-en"
@@ -648,9 +662,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -678,10 +689,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
             # ref: https://huggingface.co/microsoft/phi-2
             res = "phi-2"
-        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
-            # ref: https://huggingface.co/facebook/chameleon-7b
-            res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+        if chkhsh == "68fa7e0a33050885cc10a2acfa4df354042188f0afa03b809f7a71c4cde6e373":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
@@ -699,6 +707,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Moonlight-16B-A3B
+            res = "moonlight-a3b"
+
 
         if res is None:
             logger.warning("\n")
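
As a sanity check on the new hash entry, the pre-tokenizer fingerprint can be recomputed against a local copy of the model. A rough sketch, assuming the hashing scheme used by convert_hf_to_gguf_update.py (SHA-256 over the stringified token IDs of a fixed probe text; the probe string lives in that script and is elided here):

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "moonshotai/Moonlight-16B-A3B", trust_remote_code=True
)

chktxt = "..."  # placeholder: copy the real probe text from convert_hf_to_gguf_update.py
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
print(chkhsh)
# With the real probe text, this should print the hash registered above:
# 81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890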
