
Commit 245db15

cleanup

1 parent b19ecae

3 files changed: +36 −50 lines

convert_hf_to_gguf.py

32 additions, 41 deletions
@@ -6399,20 +6399,22 @@ class HunYuanMoEModel(TextModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # FIX for tied embeddings: Capture the token embeddings.
+        # For handling tied embeddings
         self._tok_embd = None
 
     def set_vocab(self):
-        self._set_vocab_gpt2(load_merges=False)
-        # FIX for BOS token: Manually set the correct BOS token ID.
-        # The SpecialVocab helper gets incorrect id `bos_token_id: 1` from config.json.
-        self.gguf_writer.add_bos_token_id(127959) # <|bos|>
-
-    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+        """
+        A self-contained vocab implementation for the HunYuan tiktoken-based tokenizer.
+        This method correctly generates tokens, types, and the required "fake" merges
+        to satisfy the llama.cpp GGUF loader.
+        """
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        # Fake merges
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
         merges = []
         mergeable_ranks = tokenizer.mergeable_ranks
         for token, rank in mergeable_ranks.items():
@@ -6421,19 +6423,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
             if len(merged) == 2:
                 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-        self.gguf_writer.add_token_merges(merges)
 
+        # 3. Generate the tokens and toktypes lists
         vocab_size = self.hparams["vocab_size"]
-
         reverse_vocab = tokenizer.decoder
-        assert max(reverse_vocab.keys()) < tokenizer.vocab_size, tokenizer.vocab_size == vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
         special_token_ids = set(tokenizer.special_tokens.values())
-
         tokens: list[str] = []
         toktypes: list[int] = []
-
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -6446,30 +6442,42 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        return tokens, toktypes, tokpre
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Manually set the correct BOS token ID.
+        self.gguf_writer.add_bos_token_id(127959) # <|bos|>
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        hparams = self.hparams
 
-        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
-        self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
 
-        moe_intermediate_size = self.hparams["moe_intermediate_size"]
+        moe_intermediate_size = hparams["moe_intermediate_size"]
         assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
         self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
 
-        moe_topk = self.hparams["moe_topk"]
+        moe_topk = hparams["moe_topk"]
         assert all(topk == moe_topk[0] for topk in moe_topk)
         self.gguf_writer.add_expert_used_count(moe_topk[0])
 
-        moe_shared_expert = self.hparams["num_shared_expert"]
+        moe_shared_expert = hparams["num_shared_expert"]
         assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
         self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
 
-        self.gguf_writer.add_qk_norm(self.hparams.get("use_qk_norm", True))
+        self.gguf_writer.add_qk_norm(hparams.get("use_qk_norm", True))
 
         # Rope
-        rope_scaling = self.hparams.get("rope_scaling", {})
+        rope_scaling = hparams.get("rope_scaling", {})
         if rope_scaling.get("type") == "dynamic":
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
@@ -6478,50 +6486,33 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # FIX for tied embeddings: Capture the token embeddings.
         if name == "model.embed_tokens.weight":
             self._tok_embd = data_torch.clone()
-
-        # FIX for tied embeddings: Skip the lm_head if it's tied.
         if name == "lm_head.weight":
             if self.hparams.get("tie_word_embeddings", False):
                 logger.info("Skipping tied output layer 'lm_head.weight'")
                 return []
-
-        # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["num_experts"]
             assert bid is not None
-
-            tensors: list[tuple[str, Tensor]] = []
-
             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]
-
             self._experts[bid][name] = data_torch
-
             if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
                 for w_name in ["down_proj", "gate_proj", "up_proj"]:
                     datas: list[Tensor] = []
-
                     for xid in range(n_experts):
                         ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]
-
                     data_torch = torch.stack(datas, dim=0)
-
                     merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
                     new_name = self.map_tensor_name(merged_name)
-
                     tensors.append((new_name, data_torch))
-
                 return tensors
             else:
                 return []
-
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
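For reference, the merge reconstruction in step 2 above relies on `QwenModel.bpe` and `QwenModel.token_bytes_to_string`, which are defined elsewhere in convert_hf_to_gguf.py. A minimal, self-contained sketch of the same trick follows, assuming only a tiktoken-style `mergeable_ranks` mapping (bytes -> rank); the helper names here are illustrative, not part of the script:

```python
def bpe_split(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    """Re-run BPE on `token`, allowing only merges whose rank is strictly below max_rank."""
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        if min_rank is None or min_rank >= max_rank:
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

def recover_merges(mergeable_ranks: dict[bytes, int]) -> list[tuple[bytes, bytes]]:
    """Recover a GPT-2 style merges list from a rank table that does not ship one."""
    merges = []
    for token, rank in mergeable_ranks.items():
        if len(token) < 2:
            continue  # single bytes are base vocabulary entries, not merges
        parts = bpe_split(mergeable_ranks, token, max_rank=rank)
        if len(parts) == 2:
            merges.append((parts[0], parts[1]))  # the pair that the merge with this rank joins
    return merges
```

Stopping just before a token's own rank leaves exactly the two pieces its merge would join; the real script then renders each pair with `QwenModel.token_bytes_to_string` so the GGUF loader sees an ordinary GPT-2 merges list.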

src/llama-graph.cpp

0 additions, 7 deletions
@@ -705,13 +705,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
-    if (arch == LLM_ARCH_HUNYUAN_MOE) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_expert_used, n_tokens]
-        weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [1, n_tokens]
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); // [1, n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_scaled", il);
-    }
-
     if (norm_w) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);

src/llama-model.cpp

4 additions, 2 deletions
@@ -14432,8 +14432,10 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
                     model.layers[il].ffn_down_exps,
                     nullptr,
                     n_expert, n_expert_used,
-                    LLM_FFN_SILU, false,
-                    false, 0.0,
+                    LLM_FFN_SILU,
+                    true, // norm_topk_prob
+                    false,
+                    0.0,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     il);
             cb(cur_moe, "ffn_moe_out", il);
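Reading the two C++ changes together: the arch-specific renormalization deleted from build_moe_ffn is replaced by passing `true` (commented `norm_topk_prob`) at the HunYuan call site, which enables the generic `norm_w` path visible right below the removed block. A small NumPy sketch of what that scaling does to the selected expert weights; illustrative only, not llama.cpp code:

```python
import numpy as np

# Per-token renormalization of the top-k expert weights, the same operation the
# deleted branch expressed as ggml_div(weights, ggml_sum_rows(weights)).
n_tokens, n_expert_used = 4, 8  # toy sizes
weights = np.random.rand(n_tokens, n_expert_used)
weights = weights / weights.sum(axis=-1, keepdims=True)  # each token's weights now sum to 1
assert np.allclose(weights.sum(axis=-1), 1.0)
```

The cleanup thus moves the special case out of the shared MoE builder and into the per-model call, so the computed routing weights should be unchanged.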
