Skip to content

Commit 8fd547b

Browse files
committed
failed token fix
1 parent 245db15 commit 8fd547b

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

convert_hf_to_gguf.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6416,18 +6416,21 @@ def set_vocab(self):
64166416

64176417
# 2. Reverse-engineer the merges list from mergeable_ranks
64186418
merges = []
6419+
vocab = {}
64196420
mergeable_ranks = tokenizer.mergeable_ranks
64206421
for token, rank in mergeable_ranks.items():
6422+
#vocab[QwenModel.token_bytes_to_string(token)] = rank
64216423
if len(token) == 1:
64226424
continue
64236425
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
6424-
if len(merged) == 2:
6426+
if len(merged) == 2: #todo this is an assert in Qwen, why?
64256427
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
64266428

64276429
# 3. Generate the tokens and toktypes lists
64286430
vocab_size = self.hparams["vocab_size"]
6429-
reverse_vocab = tokenizer.decoder
64306431
special_token_ids = set(tokenizer.special_tokens.values())
6432+
reverse_vocab = tokenizer.decoder
6433+
#reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_token_ids}.items()}
64316434
tokens: list[str] = []
64326435
toktypes: list[int] = []
64336436
for i in range(vocab_size):

0 commit comments

Comments (0)