Commit 23c7da7

support interns1-mini
1 parent: f44f793 · commit: 23c7da7

File tree: 1 file changed (+79, -5 lines)

convert_hf_to_gguf.py

Lines changed: 79 additions & 5 deletions
@@ -2917,7 +2917,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "language_model." in name:
             name = name.replace("language_model.", "") # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
-                or name.startswith("vision_model") or name.startswith("audio_tower"):
+                or name.startswith("vision_model") or name.startswith("audio_tower") \
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
             # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
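
Intern-S1-mini checkpoints keep their multimodal weights under the `model.vision_tower` and `model.multi_modal_projector` prefixes, which the old skip list did not match, so those tensors would have fallen through to the text-model tensor mapping. A minimal standalone sketch of the extended filter (the tensor names are made up for illustration, and a prefix tuple replaces the chained `or` clauses, which `str.startswith` accepts directly):

# Sketch of the extended skip filter; tensor names below are hypothetical.
SKIPPED_PREFIXES = (
    "mlp", "multi_modal_projector", "vision_model", "audio_tower",
    "model.vision_tower", "model.multi_modal_projector",
)

def is_vision_or_audio(name: str) -> bool:
    name = name.replace("language_model.", "")  # InternVL nests the text model
    return name.startswith(SKIPPED_PREFIXES)    # tuple form of the chained checks

assert is_vision_or_audio("model.vision_tower.encoder.layer.0.attn.qkv.weight")
assert is_vision_or_audio("model.multi_modal_projector.linear_1.weight")
assert not is_vision_or_audio("model.layers.0.self_attn.q_proj.weight")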
@@ -3589,6 +3590,82 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with Intern-S1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
+        additional_special_tokens = []
+        if special_tokens_map_file.is_file():
+            with open(special_tokens_map_file, encoding = 'utf-8') as f:
+                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+        tokenizer_cfg_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_cfg_file.is_file():
+            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
+                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
+                for token in additional_special_tokens:
+                    if token in token2ids_map:
+                        special_vocab._set_special_token(token, token2ids_map[token])
+        special_vocab._set_special_token('eos', 151645)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
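
The bulk of the commit is `_set_vocab_interns1`, which mirrors the InternS1 handling already present in `Qwen3MoeModel`: it exports a GPT-2-style (BPE) vocabulary by walking token ids from 0 to vocab_size - 1, padding any unassigned ids with `[PAD{i}]` placeholders typed UNUSED, and marking added tokens as CONTROL or USER_DEFINED (after normalizing them via an encode/decode round trip). The toy sketch below shows just that id walk, with plain strings standing in for `gguf.TokenType` and a deliberately gappy vocabulary:

# Toy illustration of the gap-filling id walk; vocab, special set, and the
# string token types are stand-ins, not the real tokenizer objects.
vocab = {"hello": 0, "world": 1, "<image>": 3}  # id 2 is intentionally unused
special = {"<image>"}                           # pretend added/special token
vocab_size = 5

reverse_vocab = {i: t for t, i in vocab.items()}
tokens, toktypes = [], []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")   # fill the hole so ids stay contiguous
        toktypes.append("UNUSED")
    elif reverse_vocab[i] in special:
        tokens.append(reverse_vocab[i])
        toktypes.append("CONTROL")
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append("NORMAL")

print(tokens)    # ['hello', 'world', '[PAD2]', '<image>', '[PAD4]']
print(toktypes)  # ['NORMAL', 'NORMAL', 'UNUSED', 'CONTROL', 'UNUSED']

The real method then writes the two lists with `gguf_writer.add_token_list` / `add_token_types`, wires up any additional special tokens found in `special_tokens_map.json`, and pins `bos`/`eos` to the Qwen ids 151643/151645.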
@@ -3605,10 +3682,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
+        super().set_vocab()
 
     def _set_vocab_interns1(self):
         tokens: list[str] = []
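
In `Qwen3MoeModel.set_vocab`, the inlined try/except is dropped in favor of `super().set_vocab()`. Below is a self-contained sketch of the pattern, assuming (as the deleted lines suggest) that the parent class implements the same sentencepiece-first, gpt2-fallback logic; the class names and prints are stand-ins for the real converter classes:

# Toy sketch of the fallback that the deleted lines duplicated. ParentModel
# stands in for the real base class, on the assumption that it implements
# the same try-sentencepiece-else-gpt2 pattern.
class ParentModel:
    def _set_vocab_sentencepiece(self):
        raise FileNotFoundError("no tokenizer.model in this checkpoint")

    def _set_vocab_gpt2(self):
        print("loaded BPE (gpt2-style) vocab")

    def set_vocab(self):
        try:
            self._set_vocab_sentencepiece()  # preferred when tokenizer.model exists
        except FileNotFoundError:
            self._set_vocab_gpt2()           # BPE-only checkpoints lack that file

class MoeModel(ParentModel):
    origin_hf_arch = "Qwen3MoeForCausalLM"   # hypothetical detected architecture

    def set_vocab(self):
        if self.origin_hf_arch == "InternS1ForConditionalGeneration":
            print("loaded InternS1 vocab")   # stands in for _set_vocab_interns1()
            return
        super().set_vocab()                  # inherit the fallback instead of copying it

MoeModel().set_vocab()                       # -> loaded BPE (gpt2-style) vocab

Delegating keeps the fallback in one place, so the dense `Qwen3Model` and the MoE variant cannot drift apart.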
