
Commit 5e78e88

almost working

1 parent 51886a4

File tree: 3 files changed, +48 / -30 lines


convert_hf_to_gguf.py

Lines changed: 44 additions & 30 deletions
@@ -842,14 +842,14 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     def _set_vocab_none(self) -> None:
         self.gguf_writer.add_tokenizer_model("none")
 
-    def _set_vocab_gpt2(self) -> None:
+    def _set_vocab_gpt2(self, load_merges=True) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
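The new load_merges parameter keeps the shared GPT-2 vocab plumbing but lets a subclass skip SpecialVocab's merge loading when it emits the merge list itself, as HunYuanMoEModel does below. A minimal usage sketch; the subclass name is hypothetical:

# Hedged sketch, not part of the commit; the class name is invented for illustration.
class SomeTiktokenModel(TextModel):
    def set_vocab(self):
        # merges are written separately via gguf_writer.add_token_merges(),
        # so SpecialVocab should not also try to load them from tokenizer files
        self._set_vocab_gpt2(load_merges=False)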
@@ -6394,15 +6394,14 @@ def set_gguf_parameters(self):
 
 
 @ModelBase.register("HunYuanMoEV1ForCausalLM")
-class HunYuanMoEModel(LlamaModel):
+class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
-    undo_permute = False
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        self._set_vocab_gpt2(load_merges=False)
 
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -6411,52 +6410,41 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        # merge logic is copied from QwenModel, maybe incorrect
         merges = []
-        vocab = {}
         mergeable_ranks = tokenizer.mergeable_ranks
         for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
                 continue
+            # bpe() will decompose the token into its smallest parts and then
+            # re-merge them. If the token is a valid merge, bpe() will return
+            # the two pieces that were merged to create it.
             merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
             if len(merged) == 2:
                 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
         self.gguf_writer.add_token_merges(merges)
 
+        vocab_size = self.hparams["vocab_size"]
+
         reverse_vocab = tokenizer.decoder
-        assert max(reverse_vocab.keys()) < tokenizer.vocab_size
+        assert max(reverse_vocab.keys()) < tokenizer.vocab_size, tokenizer.vocab_size == vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
-        added_vocab = tokenizer.get_added_vocab()
+        special_token_ids = set(tokenizer.special_tokens.values())
 
-        added_tokens_decoder = tokenizer.added_tokens_decoder
+        tokens: list[str] = []
+        toktypes: list[int] = []
 
-        for i in range(tokenizer.vocab_size):
+        for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.UNUSED)
             else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        # NOTE: this was added for Gemma.
-                        # Encoding and decoding the tokens above isn't sufficient for this case.
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_token_ids:
+                    toktypes.append(gguf.TokenType.CONTROL)
                 else:
                     toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
 
         return tokens, toktypes, tokpre
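The new comment in the merge loop summarizes why bpe() is called with max_rank=rank: the token is broken into bytes and merges are replayed only below the token's own rank, so a token that is itself a merge comes back as exactly the two pieces that formed it. A self-contained sketch of that tiktoken-style decomposition, approximating what QwenModel.bpe() does (treat it as an approximation, not the exact upstream code):

# Hedged sketch of a tiktoken-style BPE decomposition, approximating QwenModel.bpe().
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    parts = [bytes([b]) for b in token]                  # start from single bytes
    while True:
        min_idx = None
        min_rank = None
        # find the adjacent pair with the lowest merge rank
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        # stop once no pair merges below the requested rank cutoff
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

# toy ranks: "ab" and "abc" are learned merges; decomposing "abc" below its own
# rank yields the two pieces ("ab", "c") that the loop above records as a merge
ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
print(bpe(ranks, b"abc", max_rank=4))                    # [b'ab', b'c']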

@@ -6474,6 +6462,25 @@ def set_gguf_parameters(self):
         assert all(topk == moe_topk[0] for topk in moe_topk)
         self.gguf_writer.add_expert_used_count(moe_topk[0])
 
+        moe_shared_expert = self.hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        self.gguf_writer.add_qk_norm(self.hparams.get("use_qk_norm", True))
+
+        # Rope
+        rope_scaling = self.hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            logger.warning("Model uses 'dynamic' rope scaling, which is not yet supported in GGUF. "
+                           "The resulting model may not work correctly with contexts longer than the training length.")
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            # Fallback for other potential scaling types
+            # This part is inherited from TextModel and will handle standard rope_theta
+            pass
+
+    _experts: list[dict[str, Tensor]] | None = None
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
         if name.find("mlp.experts") != -1:
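The new metadata block follows the pattern already used for moe_topk: HunYuan-MoE's config stores some MoE settings as per-layer lists, so the converter checks that the list is uniform and writes a single scalar. A toy, self-contained illustration of that check (all values invented):

# Hedged illustration only; the values are invented, not read from a real config.
hparams = {
    "moe_topk": [8, 8, 8, 8],              # hypothetical per-layer top-k list
    "num_shared_expert": [1, 1, 1, 1],     # hypothetical per-layer shared-expert list
}

moe_topk = hparams["moe_topk"]
assert all(topk == moe_topk[0] for topk in moe_topk)       # refuse non-uniform configs
moe_shared_expert = hparams["num_shared_expert"]
assert all(n == moe_shared_expert[0] for n in moe_shared_expert)

print("expert_used_count   =", moe_topk[0])                # written via add_expert_used_count()
print("expert_shared_count =", moe_shared_expert[0])       # written via add_expert_shared_count()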
@@ -6511,6 +6518,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         return [(self.map_tensor_name(name), data_torch)]
 
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 ###### CONVERSION LOGIC ######
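prepare_tensors() acts as a safety net for the expert handling in modify_tensors() above: expert tensors are buffered per layer in self._experts until a layer's full set can be stacked and emitted, and anything still buffered at the end means a layer never completed. A small self-contained illustration of that end-of-run check (layer count and tensor name invented):

# Hedged, self-contained illustration of the leftover check in prepare_tensors().
n_layers = 2
experts_buffer: list[dict[str, str]] = [{} for _ in range(n_layers)]

# layer 0 received a full set and was flushed; layer 1 only got a partial set
experts_buffer[1]["model.layers.1.mlp.experts.0.up_proj.weight"] = "tensor"

leftover = [k for d in experts_buffer for k in d.keys()]
if len(leftover) > 0:
    print(f"would raise ValueError: Unprocessed experts: {leftover}")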

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ class Attention:
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        QK_NORM                = "{arch}.attention.qk_norm"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -792,6 +792,9 @@ def add_group_norm_groups(self, value: int) -> None:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
+    def add_qk_norm(self, value: bool) -> None:
+        self.add_bool(Keys.Attention.QK_NORM.format(arch=self.arch), value)
+
     def add_q_lora_rank(self, length: int) -> None:
         self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)
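The writer change is a thin wrapper: add_qk_norm() records a boolean under the per-architecture attention namespace defined by the new QK_NORM key. A quick, self-contained check of the resulting key name; the arch string "hunyuan-moe" is an assumption for illustration, the real value comes from the writer's arch field:

# Hedged illustration of the key produced by add_qk_norm(); the arch string is assumed.
QK_NORM = "{arch}.attention.qk_norm"
print(QK_NORM.format(arch="hunyuan-moe"))   # -> hunyuan-moe.attention.qk_norm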
