Commit 22540ce

Iwan Kawrakow (ikawrakow) authored
Do not allocate KV cache for unused layers (#843)

* Do not allocate KV cache for unused layers
* Do not apply the experts weight scale if it is 1

Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent 1789de5 · commit 22540ce

File tree

2 files changed: +2 −2 lines changed

src/llama-build-context.cpp
Lines changed: 1 addition & 1 deletion

@@ -863,7 +863,7 @@ llm_expert_gating_func_type gating_op,
         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }
-    if (scale_w) {
+    if (scale_w && std::abs(w_scale-1) > 1e-5f) {
         weights = ggml_scale(ctx, weights, w_scale);
         cb(weights, "ffn_moe_weights_scaled", il);
     }
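
A note on the llama-build-context.cpp change: scaling by 1 is a no-op, so the added guard skips the ggml_scale call and keeps a useless node out of the compute graph. Below is a minimal standalone sketch of the idea (a hypothetical helper, not code from this commit); the epsilon comparison is used because the scale typically comes from model metadata, where exact float equality is fragile:

#include <cmath>
#include <cstdio>

// Hypothetical helper: true only when the scale would actually change the weights.
static bool needs_scaling(float w_scale, float eps = 1e-5f) {
    return std::abs(w_scale - 1.0f) > eps;
}

int main() {
    printf("%d\n", needs_scaling(1.0f));      // 0: skip the scale op
    printf("%d\n", needs_scaling(2.5f));      // 1: apply ggml_scale
    printf("%d\n", needs_scaling(1.000001f)); // 0: within tolerance, skip
    return 0;
}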

src/llama.cpp
Lines changed: 1 addition & 1 deletion

@@ -532,7 +532,7 @@ static bool llama_kv_cache_init(
     const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_layer = hparams.n_layer - hparams.nextn_predict_layers;

     cache.has_shift = false;
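
A note on the llama.cpp change: the KV cache is allocated per layer, so subtracting hparams.nextn_predict_layers (the next-token-prediction layers, which are not used during ordinary KV-cached inference) drops their K and V buffers entirely. A back-of-the-envelope sketch of the saving, using assumed example shapes rather than any real model's hparams:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values; substitute the hparams of your model.
    const int64_t n_layer              = 61;    // hparams.n_layer
    const int64_t nextn_predict_layers = 1;     // hparams.nextn_predict_layers
    const int64_t n_ctx                = 32768; // cache slots
    const int64_t n_embd_kv            = 1024;  // per-layer K (or V) row size
    const int64_t bytes_per_elem       = 2;     // f16 cache

    // K and V each hold n_ctx * n_embd_kv elements per layer.
    const int64_t per_layer = 2 * n_ctx * n_embd_kv * bytes_per_elem;
    printf("KV bytes per layer: %lld\n", (long long) per_layer);
    printf("bytes saved       : %lld\n", (long long) (nextn_predict_layers * per_layer));
    printf("layers allocated  : %lld\n", (long long) (n_layer - nextn_predict_layers));
    return 0;
}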
