@@ -90,7 +90,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_57B_A14B: return "57B.A14B";
         case LLM_TYPE_27B: return "27B";
         case LLM_TYPE_290B: return "290B";
-        case LLM_TYPE_17B_16E: return "17Bx16E";
+        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         default: return "?B";
     }
 }
@@ -555,11 +556,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                hparams.f_attention_scale = 0.1;
+                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
 
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_17B_16E; break;
-                    default: type = LLM_TYPE_UNKNOWN;
+                switch (hparams.n_expert) {
+                    case 16: type = LLM_TYPE_17B_16E; break;
+                    case 128: type = LLM_TYPE_17B_128E; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                if (type == LLM_TYPE_17B_128E) {
+                    hparams.use_kq_norm = false;
                 }
             } break;
         case LLM_ARCH_DECI:
@@ -1643,7 +1649,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto tn = LLM_TN(arch);
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
@@ -1661,8 +1666,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
-                    bool is_moe_layer = (i + 1) % hparams.interleave_moe_layer_step == 0;
-
                     auto & layer = layers[i];
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -1688,8 +1691,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
-                    int n_ff_exp = hparams.n_ff_exp;
-                    if (n_expert == 0 || !is_moe_layer) {
+                    if (n_expert == 0) {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -1700,17 +1702,59 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                    }
+                }
+            } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+                for (int i = 0; i < n_layer; ++i) {
+                    bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                    if (is_moe_layer) {
+                        int n_ff_exp = hparams.n_ff_exp;
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
 
-                        // Shared expert branch (only used by llama 4 for now)
-                        if (arch == LLM_ARCH_LLAMA4) {
-                            const int64_t n_ff_shexp = n_ff_exp;
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
-                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-                        }
+                        // Shared expert
+                        const int64_t n_ff_shexp = n_ff_exp;
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+                    } else {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 }
             } break;
@@ -4234,7 +4278,7 @@ struct llm_build_llama : public llm_graph_context {
         ggml_tensor * inpSA = inpL;
 
         bool use_rope = arch == LLM_ARCH_LLAMA4
-            ? (il + 1) % hparams.no_rope_layer_interval != 0
+            ? (il + 1) % hparams.n_no_rope_layer_step != 0
             : true;
 
         // norm
@@ -4298,9 +4342,8 @@ struct llm_build_llama : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            if (arch == LLM_ARCH_LLAMA4 && use_rope) {
+            if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
                 // Llama4TextL2Norm
-                // TODO @ngxson : the 128E model does not use qk_norm
                 Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
                 Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
                 cb(Qcur, "Qcur_normed", il);
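
For reference, the two per-layer gates this diff introduces can be exercised in isolation: (i + 1) % hparams.n_moe_layer_step == 0 selects the MoE layers in load_tensors, and (il + 1) % hparams.n_no_rope_layer_step != 0 enables RoPE in llm_build_llama. The sketch below is not part of the commit; the layer count and both step values are invented purely for illustration.

// Illustrative sketch only: shows which layers would be MoE vs dense and
// which would apply RoPE under the modulo tests used in the diff above.
// The concrete values (12 layers, step 2, step 4) are assumptions.
#include <cstdio>

int main() {
    const int n_layer              = 12; // hypothetical layer count
    const int n_moe_layer_step     = 2;  // assumed: every 2nd layer is MoE
    const int n_no_rope_layer_step = 4;  // assumed: every 4th layer skips RoPE

    for (int il = 0; il < n_layer; ++il) {
        const bool is_moe_layer = (il + 1) % n_moe_layer_step     == 0; // same test as load_tensors
        const bool use_rope     = (il + 1) % n_no_rope_layer_step != 0; // same test as llm_build_llama
        printf("layer %2d: %-28s %s\n", il,
               is_moe_layer ? "MoE (routed + shared expert)" : "dense FFN",
               use_rope     ? "RoPE"                          : "no RoPE");
    }
    return 0;
}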