Skip to content

Commit 7789616

Browse files
authored
apply correct padding with AutoTokenizer as well
1 parent 958eea6 commit 7789616

File tree

1 file changed: +19 additions, -19 deletions

convert_hf_to_gguf.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3807,7 +3807,7 @@ def _xlmroberta_set_vocab(self) -> None:
38073807
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
38083808
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
38093809

3810-
vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
3810+
vocab_size = max(self.hparams.get('vocab_size', 0), tokenizer.vocab_size)
38113811
else:
38123812
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
38133813
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3851,24 +3851,24 @@ def _xlmroberta_set_vocab(self) -> None:
38513851
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
38523852

38533853
for token_id in range(vocab_size):
3854-
piece = tokenizer._convert_id_to_token(token_id)
3855-
text = piece.encode("utf-8")
3856-
score = tokenizer_json["model"]["vocab"][token_id][1]
3857-
3858-
toktype = SentencePieceTokenTypes.NORMAL
3859-
if token_id == unk_token_id:
3860-
toktype = SentencePieceTokenTypes.UNKNOWN
3861-
elif token_id in tokenizer.all_special_ids:
3862-
toktype = SentencePieceTokenTypes.CONTROL
3863-
elif token_id in added_vocab.values():
3864-
toktype = SentencePieceTokenTypes.USER_DEFINED
3865-
# No reliable way to detect this, but jina doesn't have any
3866-
# elif tokenizer.IsByte(token_id):
3867-
# toktype = SentencePieceTokenTypes.BYTE
3868-
3869-
tokens[token_id] = text
3870-
scores[token_id] = score
3871-
toktypes[token_id] = toktype
3854+
if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
3855+
text = piece.encode("utf-8")
3856+
score = tokenizer_json["model"]["vocab"][token_id][1]
3857+
3858+
toktype = SentencePieceTokenTypes.NORMAL
3859+
if token_id == unk_token_id:
3860+
toktype = SentencePieceTokenTypes.UNKNOWN
3861+
elif token_id in tokenizer.all_special_ids:
3862+
toktype = SentencePieceTokenTypes.CONTROL
3863+
elif token_id in added_vocab.values():
3864+
toktype = SentencePieceTokenTypes.USER_DEFINED
3865+
# No reliable way to detect this, but jina doesn't have any
3866+
# elif tokenizer.IsByte(token_id):
3867+
# toktype = SentencePieceTokenTypes.BYTE
3868+
3869+
tokens[token_id] = text
3870+
scores[token_id] = score
3871+
toktypes[token_id] = toktype
38723872

38733873
if isinstance(tokenizer, SentencePieceProcessor):
38743874
# realign tokens (see HF tokenizer code)

Comments (0)