@@ -2656,13 +2656,17 @@ def set_vocab(self):
         def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
             tokid: int = token["token"]
             tokb: list[int] = token["bytes"]
-            try:
-                tokc = bytes(tokb).decode("utf-8")
-            except Exception:
-                tokc = None
-            if len(tokb) == 1 or tokc is None:
+            if len(tokb) == 1:
                 return gguf.TokenType.BYTE, tokid, "<0x{:02X}>".format(tokb[0])
             else:
+                try:
+                    tokc = bytes(tokb).decode("utf-8")
+                except Exception:
+                    tokc = None
+                if tokc is None or not all(tokb):
+                    # Incomplete UTF-8 sequence or \0 bytes, escape it
+                    # probably doesn't tokenize correctly, but at least won't crash
+                    tokc = repr(bytes(tokb))[2:-1]
                 return toktype, tokid, tokc
 
         for token in tokenizer["special_tokens"]:
@@ -2676,8 +2680,11 @@ def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
             toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.NORMAL)
             tokens[tokid] = tokc
             toktypes[tokid] = toktype
-            scores[tokid] = score
-            score -= 1.0
+            if toktype == gguf.TokenType.BYTE:
+                scores[tokid] = 0.0
+            else:
+                scores[tokid] = score
+                score -= 1.0
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
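
For reference, a minimal standalone sketch of the escape path added in the first hunk. `escape_token_bytes` is a hypothetical helper name; it isolates just the byte-decoding fallback from decode_grok_token above:

def escape_token_bytes(tokb: list[int]) -> str:
    # Decode as UTF-8 where possible; otherwise (or when any byte is \0)
    # fall back to a repr()-based escape so conversion never crashes.
    try:
        tokc = bytes(tokb).decode("utf-8")
    except Exception:
        tokc = None
    if tokc is None or not all(tokb):
        # repr(bytes([0xFF, 0x41])) == "b'\\xffA'"; [2:-1] strips the
        # leading b' and the trailing quote, leaving the escaped "\xffA"
        tokc = repr(bytes(tokb))[2:-1]
    return tokc

assert escape_token_bytes([0x68, 0x69]) == "hi"        # valid UTF-8 passes through
assert escape_token_bytes([0xFF, 0x41]) == "\\xffA"    # invalid UTF-8 is escaped
assert escape_token_bytes([0x00, 0x41]) == "\\x00A"    # NUL bytes are escaped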
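
Likewise, a sketch of the scoring change in the second hunk: BYTE tokens now receive a fixed score of 0.0 rather than consuming a slot in the decreasing rank-based scores. The starting `score` and the token list here are illustrative only:

scores: dict[int, float] = {}
score = 0.0  # hypothetical starting value; the real one comes from the surrounding loop
for tokid, toktype in [(0, "NORMAL"), (1, "BYTE"), (2, "NORMAL")]:
    if toktype == "BYTE":
        # assumption: byte-fallback tokens don't need a rank-based score
        scores[tokid] = 0.0
    else:
        scores[tokid] = score
        score -= 1.0

assert scores == {0: 0.0, 1: 0.0, 2: -1.0}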