Skip to content

Commit 711ab17

Browse files
authored
"fix" vocab for invalid sequences
1 parent f582b84 commit 711ab17

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

convert_hf_to_gguf.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2656,13 +2656,17 @@ def set_vocab(self):
26562656
def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
26572657
tokid: int = token["token"]
26582658
tokb: list[int] = token["bytes"]
2659-
try:
2660-
tokc = bytes(tokb).decode("utf-8")
2661-
except Exception:
2662-
tokc = None
2663-
if len(tokb) == 1 or tokc is None:
2659+
if len(tokb) == 1:
26642660
return gguf.TokenType.BYTE, tokid, "<0x{:02X}>".format(tokb[0])
26652661
else:
2662+
try:
2663+
tokc = bytes(tokb).decode("utf-8")
2664+
except Exception:
2665+
tokc = None
2666+
if tokc is None or not all(tokb):
2667+
# Incomplete UTF-8 sequence or \0 bytes, escape it
2668+
# probably doesn't tokenize correctly, but at least won't crash
2669+
tokc = repr(bytes(tokb))[2:-1]
26662670
return toktype, tokid, tokc
26672671

26682672
for token in tokenizer["special_tokens"]:
@@ -2676,8 +2680,11 @@ def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenT
26762680
toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.NORMAL)
26772681
tokens[tokid] = tokc
26782682
toktypes[tokid] = toktype
2679-
scores[tokid] = score
2680-
score -= 1.0
2683+
if toktype == gguf.TokenType.BYTE:
2684+
scores[tokid] = 0.0
2685+
else:
2686+
scores[tokid] = score
2687+
score -= 1.0
26812688

26822689
self.gguf_writer.add_tokenizer_model("llama")
26832690
self.gguf_writer.add_tokenizer_pre("default")

0 commit comments

Comments (0)