Commit 27a440e

fix tokenizer
1 parent f10dfd3 commit 27a440e

2 files changed: +113 -92

convert_hf_to_gguf.py

Lines changed: 113 additions & 91 deletions
@@ -695,8 +695,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
             # ref: https://huggingface.co/Xenova/gpt-4o
             res = "gpt-4o"
-        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
-            res = "bert"

         if res is None:
             logger.warning("\n")
@@ -3088,6 +3086,97 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         return [(self.map_tensor_name(name), data_torch)]

+    def _xlmroberta_tokenizer_init(self) -> None:
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+

 @Model.register("RobertaModel")
 class RobertaModel(BertModel):
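
The `_position_offset` bookkeeping in the new `_xlmroberta_tokenizer_init()` exists because XLM-RoBERTa-style checkpoints reserve the first `1 + pad_token_id` position ids for padding, so the position embedding matrix must be chopped down by that offset. A minimal sketch of the arithmetic, using assumed hparam values that are typical of XLM-RoBERTa configs (not taken from this diff):

# Assumed example values, typical of XLM-RoBERTa-style configs (not from this diff).
hparams = {"pad_token_id": 1, "max_position_embeddings": 514}

if (pad_token_id := hparams.get("pad_token_id")) is not None:
    position_offset = 1 + pad_token_id          # 2: positions 0 and 1 are reserved
    if "max_position_embeddings" in hparams:
        hparams["max_position_embeddings"] -= position_offset
else:
    position_offset = None

print(position_offset, hparams["max_position_embeddings"])  # -> 2 512
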
@@ -3154,6 +3243,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                          eager, metadata_override, model_name, split_max_tensors,
                          split_max_size, dry_run, small_first_shard, hparams)

+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
         # the HF config claims n_ctx=8192, but it uses RoPE scaling
         self.hparams["n_ctx"] = 2048

@@ -3181,6 +3274,21 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         assert self.hparams["rotary_emb_interleaved"] is False
         assert self.hparams["rotary_emb_scale_base"] is None

+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
     def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
         # If the tensor is an experts bias tensor, skip it by returning an empty list.
         if "mlp.experts.bias" in name:
@@ -3212,96 +3320,10 @@ class XLMRobertaModel(BertModel):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self._xlmroberta_tokenizer_init()

-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
+    def set_vocab(self) -> None:
+        self._xlmroberta_set_vocab()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
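
Taken together, these hunks move the XLM-RoBERTa SentencePiece vocab export into shared `Model._xlmroberta_tokenizer_init()` / `_xlmroberta_set_vocab()` helpers and make `NomicBertModel` choose a vocab path from the tokenizer type recorded in `tokenizer.json`: "Unigram" selects the SentencePiece export, "WordPiece" keeps the regular BERT path. A standalone sketch that mirrors the added `_is_tokenizer_xlmroberta()` check (the model directory in the usage comment is a hypothetical placeholder, not part of the diff):

import json
from pathlib import Path

def tokenizer_is_xlmroberta(dir_model: Path) -> bool:
    # "Unigram" -> XLM-RoBERTa-style SentencePiece tokenizer,
    # "WordPiece" -> regular BERT vocab path.
    with open(dir_model / "tokenizer.json") as f:
        tokenizer_json = json.load(f)
    toktyp = tokenizer_json["model"]["type"]
    if toktyp == "Unigram":
        return True
    if toktyp == "WordPiece":
        return False
    raise ValueError(f"unknown tokenizer: {toktyp}")

# Hypothetical usage (the path is an assumption for illustration):
# tokenizer_is_xlmroberta(Path("models/nomic-embed-text-v2-moe"))
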

convert_hf_to_gguf_update.py

Lines changed: 0 additions & 1 deletion
@@ -110,7 +110,6 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
-    {"name": "nomic-embed-text-v2-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe", },
 ]
