Skip to content

Commit 21b1041

Browse files
sammcj and CISC authored
model: glm 4.5 apply suggestions from code review
Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent 07416e0 commit 21b1041

File tree

3 files changed

+3
-3
lines changed

3 files changed

+3
-3
lines changed

src/llama-arch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
126126
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
127127
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
128128
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
129-
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
129+
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
130130
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
131131
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
132132
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },

src/llama-kv-cache-unified.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
4141
}
4242
if (model.arch == LLM_ARCH_GLM4_MOE) {
4343
// GLM-4.5: Only process up to last layer, skip final NextN layer
44-
n_layer_cache = hparams.n_layer - hparam.nextn_predict_layers;
44+
n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
4545
}
4646

4747
// create a context for each buffer type

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13494,7 +13494,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
1349413494

1349513495
// Only process up to last layer (skip final NextN layer)
1349613496
// Final layer tensors are loaded but not processed in forward pass
13497-
const int n_transformer_layers = n_layer - hparam.nextn_predict_layers;
13497+
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
1349813498
for (int il = 0; il < n_transformer_layers; ++il) {
1349913499
ggml_tensor * inpSA = inpL;
1350013500

0 commit comments

Comments (0)