Commit 22540ce

Iwan Kawrakow (ikawrakow) authored
Do not allocate KV cache for unused layers (#843)

* Do not allocate KV cache for unused layers
* Do not apply the experts weight scale if it is 1

Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent 1789de5 · commit 22540ce

File tree

2 files changed: +2 −2 lines changed

src/llama-build-context.cpp
Lines changed: 1 addition & 1 deletion

@@ -863,7 +863,7 @@ llm_expert_gating_func_type gating_op,
         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }
-    if (scale_w) {
+    if (scale_w && std::abs(w_scale-1) > 1e-5f) {
         weights = ggml_scale(ctx, weights, w_scale);
         cb(weights, "ffn_moe_weights_scaled", il);
     }
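
A note on the llama-build-context.cpp change: scaling by 1 is a no-op, so the added guard skips the ggml_scale call and keeps a useless node out of the compute graph. Below is a minimal standalone sketch of the idea (a hypothetical helper, not code from this commit); the epsilon comparison is used because the scale typically comes from model metadata, where exact float equality is fragile:

#include <cmath>
#include <cstdio>

// Hypothetical helper: true only when the scale would actually change the weights.
static bool needs_scaling(float w_scale, float eps = 1e-5f) {
    return std::abs(w_scale - 1.0f) > eps;
}

int main() {
    printf("%d\n", needs_scaling(1.0f));      // 0: skip the scale op
    printf("%d\n", needs_scaling(2.5f));      // 1: apply ggml_scale
    printf("%d\n", needs_scaling(1.000001f)); // 0: within tolerance, skip
    return 0;
}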

src/llama.cpp
Lines changed: 1 addition & 1 deletion

@@ -532,7 +532,7 @@ static bool llama_kv_cache_init(
     const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_layer = hparams.n_layer - hparams.nextn_predict_layers;

     cache.has_shift = false;
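
A note on the llama.cpp change: the KV cache is allocated per layer, so subtracting hparams.nextn_predict_layers (the next-token-prediction layers, which are not used during ordinary KV-cached inference) drops their K and V buffers entirely. A back-of-the-envelope sketch of the saving, using assumed example shapes rather than any real model's hparams:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values; substitute the hparams of your model.
    const int64_t n_layer              = 61;    // hparams.n_layer
    const int64_t nextn_predict_layers = 1;     // hparams.nextn_predict_layers
    const int64_t n_ctx                = 32768; // cache slots
    const int64_t n_embd_kv            = 1024;  // per-layer K (or V) row size
    const int64_t bytes_per_elem       = 2;     // f16 cache

    // K and V each hold n_ctx * n_embd_kv elements per layer.
    const int64_t per_layer = 2 * n_ctx * n_embd_kv * bytes_per_elem;
    printf("KV bytes per layer: %lld\n", (long long) per_layer);
    printf("bytes saved       : %lld\n", (long long) (nextn_predict_layers * per_layer));
    printf("layers allocated  : %lld\n", (long long) (n_layer - nextn_predict_layers));
    return 0;
}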
