
Commit bb87dbf

fix: Use per-layer n_embd_k/v_s calls for mamba (1) layers

Branch: HybridRecurrentCache
Signed-off-by: Gabe Goodhart <[email protected]>

1 parent c831e76 · commit bb87dbf

File tree

1 file changed (+2 −2 lines)

src/llama-model.cpp

Lines changed: 2 additions & 2 deletions
@@ -8939,11 +8939,11 @@ struct llm_build_mamba : public llm_graph_context {
         // (ab)using the KV cache to store the states
         ggml_tensor * conv = build_recurrent_state(
                 gf, conv_states_all, state_copy,
-                hparams.n_embd_k_s(), n_seqs);
+                hparams.n_embd_k_s(il), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
         ggml_tensor * ssm = build_recurrent_state(
                 gf, ssm_states_all, state_copy,
-                hparams.n_embd_v_s(), n_seqs);
+                hparams.n_embd_v_s(il), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
 
         // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
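
For context, the point of threading the layer index il through these calls is that a hybrid model stores recurrent state only on its mamba layers, so the state-size accessors need to answer per layer rather than globally. Below is a minimal C++ sketch of that idea; the struct, field, and method names are illustrative assumptions modelled loosely on llama.cpp's hparams, not the actual implementation in this commit.

// Illustrative sketch only (not the actual llama.cpp code): per-layer
// recurrent-state sizes, so a hybrid model reports 0 for attention layers
// and the mamba conv/ssm sizes for recurrent layers.
#include <cstdint>
#include <vector>

struct hparams_sketch {
    uint32_t ssm_d_conv  = 0;   // convolution kernel width
    uint32_t ssm_d_inner = 0;   // inner (channel) dimension of the SSM block
    uint32_t ssm_d_state = 0;   // per-channel SSM state dimension
    std::vector<bool> recurrent_layer; // true for mamba layers, false otherwise

    // rows of convolution state stored per sequence for layer il
    uint32_t n_embd_k_s(uint32_t il) const {
        if (!recurrent_layer[il]) {
            return 0;
        }
        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
    }

    // rows of SSM state stored per sequence for layer il
    uint32_t n_embd_v_s(uint32_t il) const {
        if (!recurrent_layer[il]) {
            return 0;
        }
        return ssm_d_state * ssm_d_inner;
    }
};

With per-layer accessors of this shape, the cache can size the conv and ssm buffers for each layer independently, which lines up with the d_conv - 1 and d_state reshape dimensions visible in the diff above.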
