
Commit f5df812

Apply suggestions from code review

1 parent 0bb9f28
4 files changed: +6 −7 lines

convert_hf_to_gguf.py
Lines changed: 1 addition & 1 deletion

@@ -6588,7 +6588,7 @@ class Glm4MoeModel(TextModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
-        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

     def set_vocab(self):
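
For context, a minimal sketch of what this block-count change does, assuming hparams is the parsed HF config dict; the values below are illustrative, mirroring the GLM-4.5-Air case noted in src/llama-model.cpp (46 layers + 1 NextN layer):

# Illustrative config values (GLM-4.5-Air: 46 hidden layers + 1 NextN layer).
hparams = {"num_hidden_layers": 46, "num_nextn_predict_layers": 1}

# Before: one extra NextN layer was hardcoded.
block_count_old = hparams["num_hidden_layers"] + 1  # 47

# After: the count comes from the config, defaulting to 0 so configs
# without the key (i.e. no NextN/MTP layers) still convert.
block_count_new = hparams["num_hidden_layers"] + hparams.get("num_nextn_predict_layers", 0)  # 47

assert block_count_old == block_count_new == 47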

gguf-py/gguf/constants.py
Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ class LLM:
         EXPERT_WEIGHTS_NORM     = "{arch}.expert_weights_norm"
         EXPERT_GATING_FUNC      = "{arch}.expert_gating_func"
         MOE_EVERY_N_LAYERS      = "{arch}.moe_every_n_layers"
-        NEXTN_PREDICT_LAYERS    = "{arch}.num_nextn_predict_layers"
+        NEXTN_PREDICT_LAYERS    = "{arch}.nextn_predict_layers"
         POOLING_TYPE            = "{arch}.pooling_type"
         LOGIT_SCALE             = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID  = "{arch}.decoder_start_token_id"
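
The renamed constant is a per-architecture key template; a small sketch of how it expands (the arch name "glm4moe" is assumed here for illustration):

# The {arch} placeholder is filled with the model architecture name;
# "glm4moe" is an assumed example.
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
key = NEXTN_PREDICT_LAYERS.format(arch="glm4moe")
# key == "glm4moe.nextn_predict_layers"
# (before this commit: "glm4moe.num_nextn_predict_layers")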

src/llama-model.cpp
Lines changed: 1 addition & 2 deletions

@@ -1439,7 +1439,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }

                 // NextN/MTP parameters
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

                 switch (hparams.n_layer) {
                     case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)

@@ -4394,7 +4394,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                 }

-
                 // Load ALL tensors including NextN layer to satisfy total tensor count
                 // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
                 for (int i = 0; i < n_layer; ++i) {
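
The comment in the second hunk describes a load-all/process-fewer split; a schematic Python sketch of that idea (not the C++ implementation), with counts again mirroring GLM-4.5-Air:

# Schematic only: every block is materialized so the tensor count matches
# the file, but the forward pass stops before the trailing NextN layer.
n_layer = 47               # total blocks, including the NextN layer
nextn_predict_layers = 1   # read from the nextn_predict_layers key

loaded  = [f"blk.{i}" for i in range(n_layer)]       # load ALL 47 blocks
forward = loaded[:n_layer - nextn_predict_layers]    # process only 46
assert len(forward) == 46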

src/llama-vocab.cpp
Lines changed: 3 additions & 3 deletions

@@ -2185,7 +2185,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
                     || t.first == "▁<PRE>" // CodeLlama
-                    || t.first == "<|code_prefix|>" // GLM4_MOE
+                    || t.first == "<|code_prefix|>" // GLM-4.5
                 ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

@@ -2205,7 +2205,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁hole|>" // DeepSeek
                     || t.first == "<SUF>"
                     || t.first == "▁<SUF>" // CodeLlama
-                    || t.first == "<|code_suffix|>" // GLM4_MOE
+                    || t.first == "<|code_suffix|>" // GLM-4.5
                 ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

@@ -2225,7 +2225,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁end|>" // DeepSeek
                     || t.first == "<MID>"
                     || t.first == "▁<MID>" // CodeLlama
-                    || t.first == "<|code_middle|>" // GLM4_MOE
+                    || t.first == "<|code_middle|>" // GLM-4.5
                 ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
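
These three hunks only retag the comments from the internal arch name to the model name; for reference, a dictionary-style sketch of which FIM special-token IDs the annotated tokens feed (token strings and ID names taken from the diff context above):

# Sketch of the mapping the annotated lines implement in
# llama_vocab::impl::load; keys and values come from the diff context.
GLM45_FIM_TOKENS = {
    "<|code_prefix|>": "special_fim_pre_id",
    "<|code_suffix|>": "special_fim_suf_id",
    "<|code_middle|>": "special_fim_mid_id",
}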
