@@ -515,45 +515,65 @@ def does_token_look_special(self, token: str | bytes) -> bool:
 
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        tokens: list[str] = []
+        toktypes: list[int] = []
 
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
+        if hasattr(tokenizer, "vocab"):
+            # Standard Hugging Face tokenizer (e.g., GPT-2, BERT)
+            vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+            reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
+            assert max(tokenizer.vocab.values()) < vocab_size, "Vocab IDs exceed vocab_size"
+            added_vocab = tokenizer.get_added_vocab()
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    if token in added_vocab:
+                        if hasattr(tokenizer, "added_tokens_decoder") and i in tokenizer.added_tokens_decoder:
+                            if not tokenizer.added_tokens_decoder[i].normalized:
+                                previous_token = token
+                                token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                                if previous_token != token:
+                                    logger.info(f"{repr(previous_token)} normalized to {repr(token)}")
+                            if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                                toktypes.append(gguf.TokenType.CONTROL)
+                            else:
+                                token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # normalize spaces
+                                toktypes.append(gguf.TokenType.USER_DEFINED)
+                        else:
+                            toktypes.append(gguf.TokenType.USER_DEFINED)
                     else:
-                        # NOTE: this was added for Gemma.
-                        # Encoding and decoding the tokens above isn't sufficient for this case.
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                        toktypes.append(gguf.TokenType.NORMAL)
+                    tokens.append(token)
+
+        elif "TikTokenTokenizer" in type(tokenizer).__name__:
+            # TikTokenTokenizer case (no .vocab mapping)
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)  # use the vocab_size attribute
+            tokens = [tokenizer.decode([i]) for i in range(vocab_size)]  # decode token IDs to strings
+
+            # Handle special tokens
+            special_tokens = tokenizer.special_tokens_map
+            special_token_set = {v for val in special_tokens.values() for v in (val if isinstance(val, list) else [val])}
+
+            for i in range(vocab_size):
+                token = tokens[i]
+                if token in special_token_set or self.does_token_look_special(token):
+                    toktypes.append(gguf.TokenType.CONTROL)
+                elif token.strip() == "" or token.startswith("[PAD") or token.startswith("<|PAD"):
+                    toktypes.append(gguf.TokenType.UNUSED)
                 else:
                     toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
 
+        else:
+            raise ValueError(f"Unsupported tokenizer type: {type(tokenizer).__name__}")
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
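The hole-filling logic in the standard branch is easiest to see in isolation. Below is a minimal, self-contained sketch of the reverse-vocab/`[PAD{i}]` pattern, using a toy vocab dict and a stand-in `TokenType` enum in place of `gguf.TokenType`, so it runs without `transformers` or a model download:

```python
# Toy sketch of the reverse-vocab + PAD-filling pattern used in get_vocab_base.
# TokenType is a stand-in for gguf.TokenType; the vocab dict is made up.
from enum import IntEnum

class TokenType(IntEnum):
    NORMAL = 1
    CONTROL = 3
    UNUSED = 5

vocab = {"hello": 0, "world": 1, "<|eot|>": 3}  # id 2 is deliberately missing
added_vocab = {"<|eot|>"}                       # pretend added/special token
vocab_size = 5                                  # hparams may pad past the dict

reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
tokens: list[str] = []
toktypes: list[TokenType] = []

for i in range(vocab_size):
    if i not in reverse_vocab:
        # holes in the id space become [PAD{i}] placeholders marked UNUSED
        tokens.append(f"[PAD{i}]")
        toktypes.append(TokenType.UNUSED)
    else:
        token = reverse_vocab[i]
        toktypes.append(TokenType.CONTROL if token in added_vocab else TokenType.NORMAL)
        tokens.append(token)

print(tokens)    # ['hello', 'world', '[PAD2]', '<|eot|>', '[PAD4]']
print(toktypes)  # NORMAL, NORMAL, UNUSED, CONTROL, UNUSED
```

The TikToken branch arrives at the same two parallel lists by decoding each id directly, since those tokenizers expose no `.vocab` dict to reverse.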
@@ -579,9 +599,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
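For reference, the `chkhsh` values compared throughout this function are fingerprints of how a tokenizer encodes a fixed probe string. A sketch of the scheme follows; the canonical probe text lives in convert_hf_to_gguf_update.py, so the placeholder `chktxt` below will not reproduce the hashes in this table, and `compute_chkhsh` is a name chosen here for illustration:

```python
# Sketch of the chkhsh computation used to fingerprint pre-tokenizers.
# The real probe text is defined in convert_hf_to_gguf_update.py; the short
# placeholder default below will NOT reproduce the hashes in this table.
from hashlib import sha256

def compute_chkhsh(tokenizer, chktxt: str = "Hello\n\n \t world 12345") -> str:
    chktok = tokenizer.encode(chktxt)                # token ids for the probe
    return sha256(str(chktok).encode()).hexdigest()  # hash the id list's repr
```

Two tokenizers that split the probe identically collide on the same hash and thus map to the same `res`, which is why each genuinely new pre-tokenizer needs its own entry here.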
@@ -591,12 +608,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -624,9 +641,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
         if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
             # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
             res = "jina-v1-en"
@@ -648,9 +662,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -678,10 +689,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
             # ref: https://huggingface.co/microsoft/phi-2
             res = "phi-2"
-        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
-            # ref: https://huggingface.co/facebook/chameleon-7b
-            res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+        if chkhsh == "68fa7e0a33050885cc10a2acfa4df354042188f0afa03b809f7a71c4cde6e373":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
@@ -699,6 +707,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Moonlight-16B-A3B
+            res = "moonlight-a3b"
+
 
         if res is None:
             logger.warning("\n")