
Commit 52d7627

feat: Add layer filter to recurrent cache

Branch: HybridCache
Signed-off-by: Gabe Goodhart <[email protected]>

1 parent 60aab95
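This commit threads an optional layer filter through the recurrent KV cache: the constructor gains a layer_filter_cb && filter parameter, and any layer for which the filter returns false is skipped when the cache buffers are built. Both existing call sites pass nullptr, so behavior is unchanged for now. The alias itself is not part of this diff; the sketch below assumes it is the std::function shape used elsewhere in llama-kv-cache.h:

    #include <cstdint>
    #include <functional>

    // Assumed callback shape: return true to keep layer il in the cache,
    // false to have the recurrent cache skip it entirely.
    using layer_filter_cb = std::function<bool(int32_t il)>;

    // Illustrative filter: keep only the first four layers.
    layer_filter_cb filter = [](int32_t il) { return il < 4; };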

File tree

4 files changed: +21 -12 lines

  src/llama-kv-cache.cpp
  src/llama-kv-cache.h
  src/llama-model.cpp
  tests/test-memory.cpp


src/llama-kv-cache.cpp

Lines changed: 12 additions & 6 deletions
@@ -1740,12 +1740,13 @@ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
 //
 
 llama_kv_cache_recurrent::llama_kv_cache_recurrent(
-        const llama_model & model,
-                ggml_type   type_k,
-                ggml_type   type_v,
-                     bool   offload,
-                 uint32_t   kv_size,
-                 uint32_t   n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+        const llama_model & model,
+          layer_filter_cb && filter,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   offload,
+                 uint32_t   kv_size,
+                 uint32_t   n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
     LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
@@ -1787,6 +1788,11 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
     v_l.reserve(n_layer);
 
     for (int i = 0; i < n_layer; i++) {
+        if (filter && !filter(i)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, i);
+            continue;
+        }
+
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
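Taken on its own, the new skip logic is easy to demo. Here is a minimal standalone sketch of the loop above, with a toy filter standing in for a real one (LLAMA_TYPE tensors and LLAMA_LOG_DEBUG replaced by a vector and printf):

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <vector>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    int main() {
        const int32_t n_layer = 8;
        layer_filter_cb filter = [](int32_t il) { return il % 2 == 0; }; // toy: even layers only

        std::vector<int32_t> kept;
        for (int32_t i = 0; i < n_layer; i++) {
            if (filter && !filter(i)) {           // same guard as the constructor
                std::printf("layer %3d: skipped\n", i);
                continue;
            }
            kept.push_back(i);                    // stands in for creating the layer's k/v tensors
        }
        std::printf("cached %zu of %d layers\n", kept.size(), n_layer);
    }

Note that with a non-null filter, k_l and v_l end up holding fewer than n_layer entries, which any later indexing by layer id presumably has to account for.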

src/llama-kv-cache.h

Lines changed: 7 additions & 6 deletions
@@ -412,12 +412,13 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
     };
 
     llama_kv_cache_recurrent(
-            const llama_model & model,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                         bool   offload,
-                     uint32_t   kv_size,
-                     uint32_t   n_seq_max);
+            const llama_model & model,
+              layer_filter_cb && filter,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   offload,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max);
 
     ~llama_kv_cache_recurrent() = default;
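The declaration takes the filter by rvalue reference, so call sites either hand over a temporary (including one constructed from nullptr) or move in a named callable. A small self-contained illustration of that binding, with take_filter as a stand-in for the constructor:

    #include <cstdint>
    #include <functional>
    #include <utility>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    // Stand-in for the constructor parameter: a sink that consumes the callable.
    static void take_filter(layer_filter_cb && filter) {
        layer_filter_cb local = std::move(filter);
        (void) local;
    }

    int main() {
        take_filter(nullptr);                            // "no filter": cache every layer
        take_filter([](int32_t il) { return il != 0; }); // temporary binds to &&

        layer_filter_cb named = [](int32_t il) { return il < 4; };
        take_filter(std::move(named));                   // named callables must be moved
    }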

src/llama-model.cpp

Lines changed: 1 addition & 0 deletions
@@ -13208,6 +13208,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = new llama_kv_cache_recurrent(
                         *this,
+                        nullptr,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
                         cparams.offload_kqv,
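Passing nullptr here keeps the previous behavior of caching every layer. Given the HybridCache branch name, a plausible follow-up is a model whose recurrent cache covers only its recurrent layers while an attention cache takes the rest; the predicate below is purely hypothetical and not part of this commit:

    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    // Hypothetical: a hybrid model splits layers across two caches with
    // complementary filters. is_recurrent() is an assumed predicate.
    struct toy_hparams {
        bool is_recurrent(int32_t il) const { return il % 2 == 1; } // toy rule
    };

    int main() {
        toy_hparams hparams;
        layer_filter_cb recurrent_filter = [&](int32_t il) { return  hparams.is_recurrent(il); };
        layer_filter_cb attention_filter = [&](int32_t il) { return !hparams.is_recurrent(il); };
        (void) recurrent_filter;
        (void) attention_filter;
    }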

tests/test-memory.cpp

Lines changed: 1 addition & 0 deletions
@@ -209,6 +209,7 @@ static void test_llama_kv_cache_recurrent_constructor() {
     auto model = _make_model(LLM_ARCH_MAMBA);
     llama_kv_cache_recurrent cache(
         /* model     */ *model,
+        /* filter    */ nullptr,
         /* type_k    */ GGML_TYPE_F32,
         /* type_v    */ GGML_TYPE_F16,
         /* offload   */ false,
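A natural companion test (hypothetical, not part of this commit) would exercise the same constructor with a non-null filter; the trailing argument values below are illustrative, since the diff cuts off before kv_size and n_seq_max:

    static void test_llama_kv_cache_recurrent_layer_filter() {
        auto model = _make_model(LLM_ARCH_MAMBA);
        llama_kv_cache_recurrent cache(
            /* model     */ *model,
            /* filter    */ [](int32_t il) { return il == 0; }, // keep layer 0 only
            /* type_k    */ GGML_TYPE_F32,
            /* type_v    */ GGML_TYPE_F16,
            /* offload   */ false,
            /* kv_size   */ 10,
            /* n_seq_max */ 1);
    }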
