We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent f87ac1c · commit 7b0b2ea (copy full SHA for 7b0b2ea)
convert_hf_to_gguf.py
@@ -3557,6 +3557,15 @@ def set_vocab(self):
3557
else:
3558
toktypes.append(gguf.TokenType.NORMAL)
3559
3560
+ vocab_size = self.hparams["vocab_size"]
3561
+ if vocab_size > len(tokens):
3562
+ pad_count = vocab_size - len(tokens)
3563
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3564
+ for i in range(1, pad_count + 1):
3565
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3566
+ scores.append(-1000.0)
3567
+ toktypes.append(gguf.TokenType.UNUSED)
3568
+
3569
# Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
3570
self.gguf_writer.add_tokenizer_model("plamo2")
3571
self.gguf_writer.add_tokenizer_pre("default")
0 commit comments