@@ -6404,31 +6404,24 @@ def __init__(self, *args, **kwargs):
     def set_vocab(self):
         self._set_vocab_gpt2()

-    def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-        import base64
-        dic = {}
-        rank = 0
-        for line in open(tiktoken_bpe_file, "rb"):
-            if line:
-                token, _ = line.split()
-                if base64.b64decode(token) in dic:
-                    continue
-                dic[base64.b64decode(token)] = int(rank)
-                rank += 1
-        global SPECIAL_START_ID
-        SPECIAL_START_ID = rank
-        return dic
-
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        print(tokenizer)
-        print(tokenizer.tokenizer)
-        print(type(tokenizer.decoder))
-        # exit(0)
+
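+        # Rebuild the vocab and BPE merge list from tiktoken's mergeable ranks,
+        # since tiktoken-style tokenizers ship ranks instead of a merges file.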
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
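+            # QwenModel.bpe() re-tokenizes the bytes using only merges of lower
+            # rank; a two-piece result is the pair that merges into this token.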
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+        self.gguf_writer.add_token_merges(merges)

         reverse_vocab = tokenizer.decoder
         assert max(reverse_vocab.keys()) < tokenizer.vocab_size