File tree Expand file tree Collapse file tree 1 file changed +5
-2
lines changed Expand file tree Collapse file tree 1 file changed +5
-2
lines changed Original file line number Diff line number Diff line change @@ -6416,18 +6416,21 @@ def set_vocab(self):
64166416
64176417 # 2. Reverse-engineer the merges list from mergeable_ranks
64186418 merges = []
6419+ vocab = {}
64196420 mergeable_ranks = tokenizer .mergeable_ranks
64206421 for token , rank in mergeable_ranks .items ():
6422+ #vocab[QwenModel.token_bytes_to_string(token)] = rank
64216423 if len (token ) == 1 :
64226424 continue
64236425 merged = QwenModel .bpe (mergeable_ranks , token , max_rank = rank )
6424- if len (merged ) == 2 :
6426+ if len (merged ) == 2 : # TODO: Qwen asserts len(merged) == 2 here instead of skipping; find out why and whether silently dropping other lengths is safe
64256427 merges .append (' ' .join (map (QwenModel .token_bytes_to_string , merged )))
64266428
64276429 # 3. Generate the tokens and toktypes lists
64286430 vocab_size = self .hparams ["vocab_size" ]
6429- reverse_vocab = tokenizer .decoder
64306431 special_token_ids = set (tokenizer .special_tokens .values ())
6432+ reverse_vocab = tokenizer .decoder
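6433+ #reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **tokenizer.special_tokens}.items()}  # NOTE: special_token_ids is a set of ids and cannot be **-unpacked; the token->id mapping is tokenizer.special_tokens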
64316434 tokens : list [str ] = []
64326435 toktypes : list [int ] = []
64336436 for i in range (vocab_size ):
You can’t perform that action at this time.
0 commit comments