@@ -90,7 +90,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_57B_A14B: return "57B.A14B";
         case LLM_TYPE_27B: return "27B";
         case LLM_TYPE_290B: return "290B";
-        case LLM_TYPE_17B_16E: return "17Bx16E";
+        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         default: return "?B";
     }
 }
@@ -555,11 +556,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                hparams.f_attention_scale = 0.1;
+                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
 
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_17B_16E; break;
-                    default: type = LLM_TYPE_UNKNOWN;
+                switch (hparams.n_expert) {
+                    case 16: type = LLM_TYPE_17B_16E; break;
+                    case 128: type = LLM_TYPE_17B_128E; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                if (type == LLM_TYPE_17B_128E) {
+                    hparams.use_kq_norm = false;
                 }
             } break;
         case LLM_ARCH_DECI:
@@ -1643,7 +1649,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto tn = LLM_TN(arch);
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
@@ -1661,8 +1666,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
-                    bool is_moe_layer = (i + 1) % hparams.interleave_moe_layer_step == 0;
-
                     auto & layer = layers[i];
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -1688,8 +1691,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
-                    int n_ff_exp = hparams.n_ff_exp;
-                    if (n_expert == 0 || !is_moe_layer) {
+                    if (n_expert == 0) {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -1700,17 +1702,59 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                    }
+                }
+            } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+                for (int i = 0; i < n_layer; ++i) {
+                    bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                    if (is_moe_layer) {
+                        int n_ff_exp = hparams.n_ff_exp;
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
 
-                        // Shared expert branch (only used by llama 4 for now)
-                        if (arch == LLM_ARCH_LLAMA4) {
-                            const int64_t n_ff_shexp = n_ff_exp;
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
-                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                        }
+                        // Shared expert
+                        const int64_t n_ff_shexp = n_ff_exp;
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                    } else {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 }
             } break;
@@ -4234,7 +4278,7 @@ struct llm_build_llama : public llm_graph_context {
         ggml_tensor * inpSA = inpL;
 
         bool use_rope = arch == LLM_ARCH_LLAMA4
-            ? (il + 1) % hparams.no_rope_layer_interval != 0
+            ? (il + 1) % hparams.n_no_rope_layer_step != 0
             : true;
 
         // norm
@@ -4298,9 +4342,8 @@ struct llm_build_llama : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            if (arch == LLM_ARCH_LLAMA4 && use_rope) {
+            if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
                 // Llama4TextL2Norm
-                // TODO @ngxson : the 128E model does not use qk_norm
                 Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
                 Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
                 cb(Qcur, "Qcur_normed", il);
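
For reference, the two per-layer gates this diff introduces can be exercised in isolation: (i + 1) % hparams.n_moe_layer_step == 0 selects the MoE layers in load_tensors, and (il + 1) % hparams.n_no_rope_layer_step != 0 enables RoPE in llm_build_llama. The sketch below is not part of the commit; the layer count and both step values are invented purely for illustration.

// Illustrative sketch only: shows which layers would be MoE vs dense and
// which would apply RoPE under the modulo tests used in the diff above.
// The concrete values (12 layers, step 2, step 4) are assumptions.
#include <cstdio>

int main() {
    const int n_layer              = 12; // hypothetical layer count
    const int n_moe_layer_step     = 2;  // assumed: every 2nd layer is MoE
    const int n_no_rope_layer_step = 4;  // assumed: every 4th layer skips RoPE

    for (int il = 0; il < n_layer; ++il) {
        const bool is_moe_layer = (il + 1) % n_moe_layer_step     == 0; // same test as load_tensors
        const bool use_rope     = (il + 1) % n_no_rope_layer_step != 0; // same test as llm_build_llama
        printf("layer %2d: %-28s %s\n", il,
               is_moe_layer ? "MoE (routed + shared expert)" : "dense FFN",
               use_rope     ? "RoPE"                          : "no RoPE");
    }
    return 0;
}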