Skip to content

Commit 7b0b2ea

Browse files
mitmul and CISC authored
Update convert_hf_to_gguf.py
Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent f87ac1c commit 7b0b2ea

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

convert_hf_to_gguf.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3557,6 +3557,15 @@ def set_vocab(self):
35573557
else:
35583558
toktypes.append(gguf.TokenType.NORMAL)
35593559

3560+
vocab_size = self.hparams["vocab_size"]
3561+
if vocab_size > len(tokens):
3562+
pad_count = vocab_size - len(tokens)
3563+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3564+
for i in range(1, pad_count + 1):
3565+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3566+
scores.append(-1000.0)
3567+
toktypes.append(gguf.TokenType.UNUSED)
3568+
35603569
# Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
35613570
self.gguf_writer.add_tokenizer_model("plamo2")
35623571
self.gguf_writer.add_tokenizer_pre("default")

0 commit comments

Comments
 (0)