
Commit 705f84a

remove spm vocab in favor of community bpe vocab
1 parent 4abde12


convert_hf_to_gguf.py

Lines changed: 4 additions & 61 deletions
```diff
@@ -2647,68 +2647,11 @@ def set_vocab(self):
             self._set_vocab_sentencepiece()
             return
 
-        if (self.dir_model / 'tokenizer.json').is_file():
-            self._set_vocab_gpt2()
-            return
-
-        tokenizer_path = self.dir_model / 'tokenizer.tok.json'
-        with open(tokenizer_path, "r", encoding="utf-8") as f:
-            tokenizer = json.load(f)
-
-        vocab_size = tokenizer["vocab_size"]
-        tokens: list[str] = [f"[PAD{i}]" for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [gguf.TokenType.UNUSED] * vocab_size
-
-        def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
-            tokid: int = token["token"]
-            tokb: list[int] = token["bytes"]
-            if tokb == [32]:
-                tokb = [0xe2, 0x96, 0x81]
-            if len(tokb) == 1:
-                return gguf.TokenType.BYTE, tokid, "<0x{:02X}>".format(tokb[0])
-            else:
-                try:
-                    tokc = bytes(tokb).decode("utf-8").replace(" ", "▁")
-                except Exception:
-                    tokc = None
-                if tokc is None or not all(tokb):
-                    # Incomplete UTF-8 sequence or \0 bytes, escape it
-                    # probably doesn't tokenize correctly, but at least won't crash
-                    tokc = repr(bytes(tokb))[2:-1]
-                return toktype, tokid, tokc
-
-        for token in tokenizer["special_tokens"]:
-            toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.CONTROL)
-            tokens[tokid] = tokc
-            toktypes[tokid] = toktype
-            scores[tokid] = 0.0
-
-        score = -0.0
-        for token in tokenizer["regular_tokens"]:
-            toktype, tokid, tokc = decode_grok_token(token, gguf.TokenType.NORMAL)
-            tokens[tokid] = tokc
-            toktypes[tokid] = toktype
-            if toktype == gguf.TokenType.BYTE:
-                scores[tokid] = 0.0
-            else:
-                scores[tokid] = score
-                score -= 1.0
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        self.gguf_writer.add_add_bos_token(False)
+        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
+            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
+            sys.exit(1)
 
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.special_token_ids["pad"] = 0
-        special_vocab.special_token_ids["sep"] = 1
-        special_vocab.special_token_ids["eos"] = 2
-        special_vocab.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_gpt2()
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
```
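With this change the converter no longer builds the Grok vocab from `tokenizer.tok.json`; it expects a HF-style `tokenizer.json` and `chat_template.jinja` in the model directory and exits otherwise. A minimal sketch of fetching the two required files from the repo named in the error message, assuming the `huggingface_hub` package is installed (`model_dir` is a hypothetical local path):

```python
# Sketch: download the community BPE vocab and chat template into the model
# directory so the converter's file check passes. Assumes huggingface_hub;
# "grok-2" is a placeholder for the local model directory.
from huggingface_hub import hf_hub_download

model_dir = "grok-2"
for fname in ("tokenizer.json", "chat_template.jinja"):
    hf_hub_download(
        repo_id="alvarobartt/grok-2-tokenizer",
        filename=fname,
        local_dir=model_dir,
    )
```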

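For reference, the deleted code mapped each token's raw bytes to a SentencePiece-style string before writing a "llama" tokenizer model. A standalone sketch of that mapping, distilled from the removed `decode_grok_token` (token-id and token-type bookkeeping omitted):

```python
# Byte-to-string convention of the removed SPM-style path: a lone space
# becomes U+2581 (the SentencePiece word boundary), single bytes become
# <0xXX> byte tokens, and undecodable sequences are escaped.
def decode_token_bytes(tokb: list[int]) -> str:
    if tokb == [32]:
        tokb = [0xE2, 0x96, 0x81]  # UTF-8 encoding of '▁'
    if len(tokb) == 1:
        return "<0x{:02X}>".format(tokb[0])
    try:
        tokc = bytes(tokb).decode("utf-8").replace(" ", "▁")
    except UnicodeDecodeError:
        tokc = None
    if tokc is None or not all(tokb):
        # Incomplete UTF-8 sequence or NUL bytes: escape so conversion
        # at least doesn't crash
        tokc = repr(bytes(tokb))[2:-1]
    return tokc

print(decode_token_bytes([32]))                           # '▁'
print(decode_token_bytes([0x41]))                         # '<0x41>'
print(decode_token_bytes(list("hello world".encode())))   # 'hello▁world'
```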