
Commit 46fe5cb

clean up model conversion
1 parent a518c11 commit 46fe5cb

File tree: 7 files changed, +82 -32 lines

  convert_hf_to_gguf.py
  gguf-py/gguf/gguf_writer.py
  src/llama-arch.cpp
  src/llama-arch.h
  src/llama-hparams.h
  src/llama-model.cpp
  src/llama-model.h

convert_hf_to_gguf.py

Lines changed: 5 additions & 6 deletions
@@ -1774,7 +1774,7 @@ def __init__(self, *args, **kwargs):
         if "vision_config" in hparams:
             logger.info("Has vision encoder, but it will be ignored")
             self.has_vision = True
-        # hacky renaming
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
         self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
         self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]

@@ -1783,16 +1783,15 @@ def set_vocab(self):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        # TODO @ngxson : this is for testing, will be cleaned up later
-        self.gguf_writer.add_uint32("llama4.interleave_moe_layer_step", self.hparams["interleave_moe_layer_step"])
-        self.gguf_writer.add_uint32("llama4.no_rope_layer_interval", 4) # every 4th layer
-        self.gguf_writer.add_uint32("llama4.expert_feed_forward_length", self.hparams["intermediate_size_moe"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         name = name.replace("language_model.", "")
         name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
         name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now

+        # split the gate_up into gate and up
         if "gate_up_proj" in name:
             name_up = name.replace("gate_up_proj", "up_proj.weight")
             name_gate = name.replace("gate_up_proj", "gate_proj.weight")
@@ -1802,7 +1801,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
                 (self.map_tensor_name(name_gate), gate_proj_weight),
                 (self.map_tensor_name(name_up), up_proj_weight)
             ]
-
+
         if name.endswith("down_proj"):
             name += ".weight"
             data_torch = data_torch.transpose(-1, -2)
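
As context for the gate_up_proj handling above: the lines that actually perform the split are elided from this hunk, but conceptually the fused tensor holds the gate and up projections side by side. A minimal, hypothetical sketch (shapes, dimension choice, and variable names are placeholders, not the exact code from this file):

import torch

# Hypothetical fused weight: gate and up halves stacked along the last dim.
n_embd, n_ff = 16, 32                     # placeholder sizes
fused = torch.randn(n_embd, 2 * n_ff)

# Split into two equal halves; this mirrors the idea behind name_gate/name_up.
gate_proj_weight, up_proj_weight = fused.chunk(2, dim=-1)

assert gate_proj_weight.shape == (n_embd, n_ff)
assert up_proj_weight.shape == (n_embd, n_ff)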

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -746,6 +746,9 @@ def add_wkv_head_size(self, size: int) -> None:
     def add_token_shift_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)

+    def add_interleave_moe_layer_step(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
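
For reference, a converter would call the new helper roughly like this; a minimal sketch assuming the gguf-py package from this tree (with the new method) is on the path, and with placeholder file name and values:

from gguf import GGUFWriter

# Placeholder path and values; a real converter derives these from the HF config.
writer = GGUFWriter("llama4-example.gguf", "llama4")
writer.add_interleave_moe_layer_step(1)       # -> key "llama4.interleave_moe_layer_step"
writer.add_expert_feed_forward_length(8192)   # -> key "llama4.expert_feed_forward_length"

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()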

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
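
As a quick check on the naming: the "%s" in the format string is filled with the architecture name, so for Llama 4 this key resolves to the same literal string the converter previously wrote by hand. A trivial illustration in Python:

fmt = "%s.interleave_moe_layer_step"
print(fmt % "llama4")   # -> llama4.interleave_moe_layer_step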

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -119,6 +119,7 @@ enum llm_kv {
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
+   LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,

src/llama-hparams.h

Lines changed: 7 additions & 5 deletions
@@ -112,11 +112,13 @@ struct llama_hparams {
    bool use_alibi = false;
    bool attn_soft_cap = false;

-   // TODO @ngxson : variable names taken from python code, we can rename it later
-   uint32_t interleave_moe_layer_step = 1; // TODO read from gguf
-   uint32_t no_rope_layer_interval = 4; // TODO read from gguf
-   uint32_t attn_temperature_tuning = 4; // TODO read from gguf
-   uint32_t floor_scale = 8192; // TODO read from gguf
+   uint32_t n_moe_layer_step = 0;
+   bool use_kq_norm = true;
+   // values below seems to be fixed on llama4
+   uint32_t n_no_rope_layer_step = 4;
+   uint32_t n_attn_temp_tuning = 4;
+   uint32_t n_attn_temp_floor_scale = 8192;
+   float f_attn_temp_scale = 0.1;

    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
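
To make the renamed hyperparameters concrete: n_moe_layer_step selects which layers get the MoE branch and n_no_rope_layer_step selects which layers skip RoPE, exactly as the per-layer checks in llama-model.cpp below. A small illustrative sketch in Python (layer count and step values are placeholders):

n_layer = 8                # placeholder; real models have far more layers
n_moe_layer_step = 1       # 1 means every layer has the MoE branch
n_no_rope_layer_step = 4   # every 4th layer skips RoPE

for il in range(n_layer):
    is_moe_layer = (il + 1) % n_moe_layer_step == 0
    use_rope = (il + 1) % n_no_rope_layer_step != 0
    print(f"layer {il}: moe={is_moe_layer} rope={use_rope}")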

src/llama-model.cpp

Lines changed: 64 additions & 21 deletions
@@ -90,7 +90,8 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_290B: return "290B";
-       case LLM_TYPE_17B_16E: return "17Bx16E";
+       case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+       case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        default: return "?B";
    }
}
@@ -555,11 +556,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-               hparams.f_attention_scale = 0.1;
+               ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);

-               switch (hparams.n_layer) {
-                   case 48: type = LLM_TYPE_17B_16E; break;
-                   default: type = LLM_TYPE_UNKNOWN;
+               switch (hparams.n_expert) {
+                   case 16: type = LLM_TYPE_17B_16E; break;
+                   case 128: type = LLM_TYPE_17B_128E; break;
+                   default: type = LLM_TYPE_UNKNOWN;
+               }
+
+               if (type == LLM_TYPE_17B_128E) {
+                   hparams.use_kq_norm = false;
                }
            } break;
        case LLM_ARCH_DECI:
@@ -1643,7 +1649,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    const auto tn = LLM_TN(arch);
    switch (arch) {
        case LLM_ARCH_LLAMA:
-       case LLM_ARCH_LLAMA4:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
@@ -1661,8 +1666,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                }

                for (int i = 0; i < n_layer; ++i) {
-                   bool is_moe_layer = (i + 1) % hparams.interleave_moe_layer_step == 0;
-
                    auto & layer = layers[i];

                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -1688,8 +1691,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }

-                   int n_ff_exp = hparams.n_ff_exp;
-                   if (n_expert == 0 || !is_moe_layer) {
+                   if (n_expert == 0) {
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -1700,17 +1702,59 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                    } else {
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                       layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                   }
+               }
+           } break;
+       case LLM_ARCH_LLAMA4:
+           {
+               tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+               // output
+               output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+               output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+               // if output is NULL, init from the input tok embed
+               if (output == NULL) {
+                   output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+               }
+
+               GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+               for (int i = 0; i < n_layer; ++i) {
+                   bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+                   auto & layer = layers[i];
+
+                   layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                   layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                   layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                   layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                   layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                   layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                   layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                   if (is_moe_layer) {
+                       int n_ff_exp = hparams.n_ff_exp;
+
+                       layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

-                       // Shared expert branch (only used by llama 4 for now)
-                       if (arch == LLM_ARCH_LLAMA4) {
-                           const int64_t n_ff_shexp = n_ff_exp;
-                           layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                           layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
-                           layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                       }
+                       // Shared expert
+                       const int64_t n_ff_shexp = n_ff_exp;
+                       layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                       layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+                       layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                   } else {
+                       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                       layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                }
            } break;
@@ -4234,7 +4278,7 @@ struct llm_build_llama : public llm_graph_context {
            ggml_tensor * inpSA = inpL;

            bool use_rope = arch == LLM_ARCH_LLAMA4
-               ? (il + 1) % hparams.no_rope_layer_interval != 0
+               ? (il + 1) % hparams.n_no_rope_layer_step != 0
                : true;

            // norm
@@ -4298,9 +4342,8 @@ struct llm_build_llama : public llm_graph_context {
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-           if (arch == LLM_ARCH_LLAMA4 && use_rope) {
+           if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
                // Llama4TextL2Norm
-               // TODO @ngxson : the 128E model does not use qk_norm
                Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
                Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
                cb(Qcur, "Qcur_normed", il);
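
The ggml_rms_norm calls in the last hunk normalize Q and K without a learned scale (the Llama4TextL2Norm behaviour referenced in the comment). A small numpy sketch of that operation, for illustration only (shapes are placeholders):

import numpy as np

def rms_norm(x: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Divide by the root mean square over the last dimension; no learned weight.
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

q = np.random.randn(4, 8).astype(np.float32)   # placeholder (tokens, head_dim)
q_normed = rms_norm(q)
print(np.sqrt(np.mean(q_normed * q_normed, axis=-1)))  # close to 1.0 per row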

src/llama-model.h

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ enum llm_type {
    LLM_TYPE_27B,
    LLM_TYPE_290B,
    LLM_TYPE_17B_16E, // llama4 Scout
+   LLM_TYPE_17B_128E, // llama4 Maverick
};

struct llama_layer_posnet {
