@@ -2647,68 +2647,11 @@ def set_vocab(self):
2647 2647             self._set_vocab_sentencepiece()
2648 2648             return
2649 2649
2650 -         if (self.dir_model / 'tokenizer.json').is_file():
2651 -             self._set_vocab_gpt2()
2652 -             return
2653 -
2654 -         tokenizer_path = self.dir_model / 'tokenizer.tok.json'
2655 -         with open(tokenizer_path, "r", encoding="utf-8") as f:
2656 -             tokenizer = json.load(f)
2657 -
2658 -         vocab_size = tokenizer["vocab_size"]
2659 -         tokens: list[str] = [f"[PAD{i}]" for i in range(vocab_size)]
2660 -         scores: list[float] = [-10000.0] * vocab_size
2661 -         toktypes: list[int] = [gguf.TokenType.UNUSED] * vocab_size
2662 -
2663 -         def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
2664 -             tokid: int = token["token"]
2665 -             tokb: list[int] = token["bytes"]
2666 -             if tokb == [32]:
2667 -                 tokb = [0xe2, 0x96, 0x81]
2668 -             if len(tokb) == 1:
2669 -                 return gguf.TokenType.BYTE, tokid, "<0x{:02X}>".format(tokb[0])
2670 -             else:
2671 -                 try:
2672 -                     tokc = bytes(tokb).decode("utf-8").replace(" ", "▁")
2673 -                 except Exception:
2674 -                     tokc = None
2675 -                 if tokc is None or not all(tokb):
2676 -                     # Incomplete UTF-8 sequence or \0 bytes, escape it
2677 -                     # probably doesn't tokenize correctly, but at least won't crash
2678 -                     tokc = repr(bytes(tokb))[2:-1]
2679 -                 return toktype, tokid, tokc
2680 -
2681 -         for token in tokenizer["special_tokens"]:
2682 -             toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.CONTROL)
2683 -             tokens[tokid] = tokc
2684 -             toktypes[tokid] = toktype
2685 -             scores[tokid] = 0.0
2686 -
2687 -         score = -0.0
2688 -         for token in tokenizer["regular_tokens"]:
2689 -             toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.NORMAL)
2690 -             tokens[tokid] = tokc
2691 -             toktypes[tokid] = toktype
2692 -             if toktype == gguf.TokenType.BYTE:
2693 -                 scores[tokid] = 0.0
2694 -             else:
2695 -                 scores[tokid] = score
2696 -                 score -= 1.0
2697 -
2698 -         self.gguf_writer.add_tokenizer_model("llama")
2699 -         self.gguf_writer.add_tokenizer_pre("default")
2700 -         self.gguf_writer.add_token_list(tokens)
2701 -         self.gguf_writer.add_token_scores(scores)
2702 -         self.gguf_writer.add_token_types(toktypes)
2703 -
2704 -         self.gguf_writer.add_add_bos_token(False)
2650 +         if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
2651 +             logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
2652 +             sys.exit(1)
2705 2653
2706 -         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2707 -         special_vocab.special_token_ids["pad"] = 0
2708 -         special_vocab.special_token_ids["sep"] = 1
2709 -         special_vocab.special_token_ids["eos"] = 2
2710 -         special_vocab.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
2711 -         special_vocab.add_to_gguf(self.gguf_writer)
2654 +         self._set_vocab_gpt2()
2712 2655
2713 2656     def __init__(self, *args, **kwargs):
2714 2657         super().__init__(*args, **kwargs)