
Commit cb5d91b

add missing mask token
also correct an obvious vocab padding error (most likely no actual change for any model out there, but the logic at least makes sense now)
1 parent 259469c commit cb5d91b
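
The padding half of the change is easiest to see in isolation: the token/score/type lists are pre-allocated with [PADi] placeholders up to the larger of the config's vocab_size and the SentencePiece model's size, so the post-hoc padding loop removed below was dead code (the lists were already at full length). Below is a minimal sketch of that sizing logic; the helper name and the example sizes are illustrative assumptions, not values taken from the commit.

```python
# Minimal sketch (not the converter itself) of the vocab sizing after this commit.
# The function name and example sizes are illustrative assumptions.
def build_placeholder_vocab(hparams_vocab_size: int, sp_vocab_size: int):
    # take the larger of the config's vocab_size and the SentencePiece model's size
    vocab_size = max(hparams_vocab_size, sp_vocab_size)
    # pre-fill every slot with a [PADi] placeholder and a very low score;
    # real pieces from the tokenizer later overwrite indices 0..sp_vocab_size-1
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores = [-10000.0] * vocab_size
    return tokens, scores

tokens, scores = build_placeholder_vocab(250002, 250001)
assert len(tokens) == 250002  # the larger size wins, so no later padding loop is needed
```

Taking the max also covers the case where the config's vocab_size is smaller than the tokenizer's, which is why the commit message expects no actual change for existing models.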

File tree: 1 file changed (+7, -9 lines)


convert_hf_to_gguf.py

Lines changed: 7 additions & 9 deletions
@@ -3665,7 +3665,7 @@ def _xlmroberta_set_vocab(self) -> None:
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))
 
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = max(self.hparams.get('vocab_size', 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3690,14 +3690,6 @@ def _xlmroberta_set_vocab(self) -> None:
             scores[token_id] = score
             toktypes[token_id] = toktype
 
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
         # realign tokens (see HF tokenizer code)
         tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
         scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
@@ -3708,6 +3700,12 @@ def _xlmroberta_set_vocab(self) -> None:
             SentencePieceTokenTypes.UNKNOWN,
         ] + toktypes[3:-1]
 
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = "<mask>"
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
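
For the mask-token half of the change, index 250001 is hard-coded because that is where XLM-RoBERTa-style tokenizers place `<mask>` after the realignment above. A quick, optional sanity check against a Hugging Face checkpoint could look like the snippet below; `xlm-roberta-base` is used as an assumed stand-in for the tokenizer these models ship with, and the expected output is an assumption to verify, not something stated in the commit.

```python
# Optional sanity check (not part of the commit): verify the hard-coded index 250001.
# "xlm-roberta-base" is an assumed stand-in checkpoint with the same SentencePiece vocab.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(tok.mask_token, tok.mask_token_id)  # expected: <mask> 250001
```

If that assumption did not hold for some checkpoint, the index would have to be looked up from the tokenizer rather than hard-coded.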
