File tree Expand file tree Collapse file tree 1 file changed +10
-1
lines changed Expand file tree Collapse file tree 1 file changed +10
-1
lines changed Original file line number Diff line number Diff line change @@ -5605,8 +5605,17 @@ def set_vocab(self):
56055605 print (f'tokenizer.vocab_size = { tokenizer .vocab_size } ' , flush = True )
56065606 print ("====================================" , flush = True )
56075607
5608- assert tokenizer . vocab_size == vocab_size
5608+ # Kimi-K2's tokenizer incorrectly adds 2 extra reserved tokens with ids >= vocab_size; drop them so the vocab size check below holds
56095609 special_tokens = tokenizer .special_tokens
5610+ tokenizer_vocab_size = tokenizer .vocab_size
5611+ bad_tokens = ["<|reserved_token_163840|>" , "<|reserved_token_163841|>" ]
5612+ for bad_token in bad_tokens :
5613+ if bad_token in special_tokens and special_tokens [bad_token ] >= vocab_size :
5614+ print (f"Removing bad reserved token = { bad_token } " )
5615+ special_tokens .pop (bad_token )
5616+ tokenizer_vocab_size -= 1
5617+
5618+ assert tokenizer_vocab_size == vocab_size
56105619 reverse_vocab = {id_ : encoded_tok for encoded_tok , id_ in {** vocab , ** special_tokens }.items ()}
56115620 tokens : list [str ] = []
56125621 toktypes : list [int ] = []
You can’t perform that action at this time.
0 commit comments