Commit 38acf7f ("tokenizer ok")

Parent: f5d8a22
3 files changed: +18 -19 lines

convert_hf_to_gguf.py

Lines changed: 12 additions & 19 deletions
@@ -6404,31 +6404,24 @@ def __init__(self, *args, **kwargs):
     def set_vocab(self):
         self._set_vocab_gpt2()
 
-    def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-        import base64
-        dic = {}
-        rank = 0
-        for line in open(tiktoken_bpe_file, "rb"):
-            if line:
-                token, _ = line.split()
-                if base64.b64decode(token) in dic:
-                    continue
-                dic[base64.b64decode(token)] = int(rank)
-                rank += 1
-        global SPECIAL_START_ID
-        SPECIAL_START_ID=rank
-        return dic
-
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        print(tokenizer)
-        print(tokenizer.tokenizer)
-        print(type(tokenizer.decoder))
-        # exit(0)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+        self.gguf_writer.add_token_merges(merges)
 
         reverse_vocab = tokenizer.decoder
         assert max(reverse_vocab.keys()) < tokenizer.vocab_size
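The added loop leans on the QwenModel helpers to recover GPT-2-style merge rules from a tiktoken-style mergeable_ranks table, which ships ranks but no explicit merges list. A self-contained sketch of the underlying idea (an illustrative re-implementation, not the exact llama.cpp helper): for each multi-byte token, re-run byte-pair merging while only permitting merges of strictly lower rank; if the token then splits into exactly two parts, that pair is the merge rule that produced it.

    from typing import Optional

    def bpe_split(mergeable_ranks: dict[bytes, int], token: bytes,
                  max_rank: Optional[int] = None) -> list[bytes]:
        # Start from single bytes and repeatedly merge the lowest-ranked
        # adjacent pair, stopping before any merge of rank >= max_rank.
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx, min_rank = i, rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            parts = (parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]]
                     + parts[min_idx + 2:])
        return parts

    # With ranks {b"a": 0, b"b": 1, b"ab": 2}, bpe_split(ranks, b"ab", max_rank=2)
    # yields [b"a", b"b"]: the pair that merges into b"ab", recorded as "a b".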

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4     = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+        LLAMA_VOCAB_PRE_TYPE_HUNYUAN    = 36,
     };
 
     enum llama_rope_type {

src/llama-vocab.cpp

Lines changed: 5 additions & 0 deletions
@@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1656,6 +1657,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "hunyuan") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
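The new branch is driven by the tokenizer.ggml.pre metadata string stored in the GGUF file at conversion time. A minimal writer-side sketch, assuming the gguf-py package that ships with llama.cpp (output path and architecture name are illustrative), showing the string that selects LLAMA_VOCAB_PRE_TYPE_HUNYUAN at load time:

    from gguf import GGUFWriter

    writer = GGUFWriter("out.gguf", "hunyuan-moe")  # illustrative path and arch
    writer.add_tokenizer_model("gpt2")    # BPE-family tokenizer
    writer.add_tokenizer_pre("hunyuan")   # written to tokenizer.ggml.pre, matched above
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()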
