
Commit 37888ba

fix comment
1 parent 23c7da7 commit 37888ba

File tree: 1 file changed

convert_hf_to_gguf.py

Lines changed: 48 additions & 126 deletions
@@ -1210,6 +1210,54 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
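
Note: the `_set_vocab_interns1` method added above walks ids 0..vocab_size-1, backfills any gap with a `[PAD{i}]` placeholder, and tags each entry as NORMAL, CONTROL, or USER_DEFINED before writing the list to the GGUF. Below is a minimal sketch of that backfill/classification pass, using a toy vocabulary and a stand-in enum instead of `transformers.AutoTokenizer` and `gguf.TokenType` (both stand-ins are illustrative, not the project's real types):

    from enum import Enum

    class TokenType(Enum):          # stand-in for gguf.TokenType
        NORMAL = 1
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5

    vocab = {"hello": 0, "world": 1, "<|im_start|>": 3}   # id 2 is missing
    added = {"<|im_start|>"}                              # added/special tokens
    vocab_size = 4

    reverse_vocab = {i: t for t, i in vocab.items()}
    tokens: list[str] = []
    toktypes: list[TokenType] = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            tokens.append(f"[PAD{i}]")                    # fill the gap so ids stay dense
            toktypes.append(TokenType.UNUSED)
        else:
            tok = reverse_vocab[i]
            tokens.append(tok)
            toktypes.append(TokenType.CONTROL if tok in added else TokenType.NORMAL)

    for tok, tt in zip(tokens, toktypes):
        print(tok, tt.name)   # hello NORMAL / world NORMAL / [PAD2] UNUSED / <|im_start|> CONTROL

The placeholder entries keep the GGUF token list dense, so token ids in the converted model stay aligned with the original tokenizer's ids.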
@@ -3603,69 +3651,6 @@ def set_vocab(self):
 
         super().set_vocab()
 
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-                special_vocab._set_special_token('eos', 151645)
-                special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
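
Note: the tail of the block deleted here (and again below for Qwen3MoeModel) resolved the `additional_special_tokens` list from `special_tokens_map.json` against the ids in `added_tokens_decoder`, then pinned eos/bos explicitly. A rough sketch of that lookup with inline dictionaries in place of the JSON files; 151643 and 151645 are the bos/eos ids from the deleted lines, while the `<|im_start|>` id is a made-up example:

    # Illustrative stand-ins for the two JSON payloads the deleted code read from disk.
    special_tokens_map = {"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]}
    added_tokens_decoder = {
        "151643": {"content": "<|endoftext|>", "special": True},
        "151644": {"content": "<|im_start|>", "special": True},   # hypothetical id
        "151645": {"content": "<|im_end|>", "special": True},
    }

    # content -> id, restricted to entries flagged as special (the token2ids_map above)
    token2ids_map = {d["content"]: int(i) for i, d in added_tokens_decoder.items() if d["special"]}

    for tok in special_tokens_map["additional_special_tokens"]:
        if tok in token2ids_map:
            print(f"register special token {tok!r} with id {token2ids_map[tok]}")
    print("pin eos ->", 151645, "and bos ->", 151643)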
@@ -3684,69 +3669,6 @@ def set_vocab(self):
 
         super().set_vocab()
 
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-                special_vocab._set_special_token('eos', 151645)
-                special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
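
Note: the net effect of the commit is deduplication: the helper now lives once on the shared text-model base class, and both Qwen3 converters inherit it instead of carrying identical private copies. A schematic of that pattern with simplified stand-in class names (not the real TextModel/Qwen3Model definitions):

    class TextModelSketch:
        """Stand-in for the shared base class that now owns the helper."""

        def _set_vocab_interns1(self) -> None:
            print(f"{type(self).__name__}: building the InternS1-style vocab in one place")


    class Qwen3ModelSketch(TextModelSketch):
        def set_vocab(self) -> None:
            # the subclass only dispatches to the inherited helper
            self._set_vocab_interns1()


    class Qwen3MoeModelSketch(TextModelSketch):
        def set_vocab(self) -> None:
            self._set_vocab_interns1()


    Qwen3ModelSketch().set_vocab()
    Qwen3MoeModelSketch().set_vocab()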

Comments (0)