
Commit 50b8ad4

feat: Support hybrid recurrent cache in llm_graph_context

Branch: HybridRecurrentCache
Signed-off-by: Gabe Goodhart <[email protected]>

1 parent b06b275 · commit 50b8ad4

File tree: 2 files changed (+37 −8 lines)


src/llama-graph.cpp

Lines changed: 33 additions & 8 deletions
@@ -7,6 +7,7 @@
 #include "llama-kv-cache-unified.h"
 #include "llama-kv-cache-unified-iswa.h"
 #include "llama-kv-cache-recurrent.h"
+#include "llama-kv-cache-hybrid-recurrent.h"
 
 #include <cassert>
 #include <cmath>
@@ -957,7 +958,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    const auto * kv_state = get_state_recurrent();
 
     auto inp = std::make_unique<llm_graph_input_s_copy>(kv_state);
 
@@ -974,7 +975,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    const auto * kv_state = get_state_recurrent();
 
     auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
 
@@ -1028,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);
+    const auto * kv_state = get_state_unified();
 
     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_state);
 
@@ -1059,6 +1060,30 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
     return pos_bias;
 }
 
+const llama_kv_cache_unified_state * llm_graph_context::get_state_unified() const {
+    const auto * umstate = dynamic_cast<const llama_kv_cache_unified_state *>(mstate);
+    if (!umstate) {
+        const auto hmstate = dynamic_cast<const llama_kv_cache_hybrid_recurrent_state *>(mstate);
+        if (hmstate) {
+            umstate = hmstate->get_state_attn();
+        }
+    }
+    GGML_ASSERT(umstate);
+    return umstate;
+}
+
+const llama_kv_cache_recurrent_state * llm_graph_context::get_state_recurrent() const {
+    const auto * rmstate = dynamic_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    if (!rmstate) {
+        const auto hmstate = dynamic_cast<const llama_kv_cache_hybrid_recurrent_state *>(mstate);
+        if (hmstate) {
+            rmstate = hmstate->get_state_recurrent();
+        }
+    }
+    GGML_ASSERT(rmstate);
+    return rmstate;
+}
+
 ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_cgraph * gf,
         ggml_tensor * q,
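Both new accessors follow the same fallback pattern: try a direct dynamic_cast of mstate first, and only if that fails check whether mstate is the hybrid wrapper and unwrap the matching half. Below is a minimal, self-contained sketch of that pattern; the state classes are simplified stand-ins, not the real llama.cpp definitions.

#include <cassert>

// Simplified stand-ins for the memory-state interface and concrete states.
struct memory_state_i  { virtual ~memory_state_i() = default; };
struct unified_state   : memory_state_i {};
struct recurrent_state : memory_state_i {};

// Mirrors llama_kv_cache_hybrid_recurrent_state: one attention half and
// one recurrent half, each exposed through a getter.
struct hybrid_state : memory_state_i {
    const unified_state   * get_state_attn()      const { return &attn; }
    const recurrent_state * get_state_recurrent() const { return &recr; }

    unified_state   attn;
    recurrent_state recr;
};

// The fallback pattern from get_state_unified() above: direct cast first,
// then unwrap the hybrid wrapper; any other state type is a hard error.
const unified_state * get_state_unified(const memory_state_i * mstate) {
    const auto * ustate = dynamic_cast<const unified_state *>(mstate);
    if (!ustate) {
        if (const auto * hstate = dynamic_cast<const hybrid_state *>(mstate)) {
            ustate = hstate->get_state_attn();
        }
    }
    assert(ustate);
    return ustate;
}

int main() {
    unified_state u;
    hybrid_state  h;

    // The same call resolves both a plain unified state and a hybrid one,
    // so graph-builder code never needs to know which cache is in use.
    assert(get_state_unified(&u) == &u);
    assert(get_state_unified(&h) == h.get_state_attn());
    return 0;
}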
@@ -1234,7 +1259,7 @@ ggml_tensor * llm_graph_context::build_attn(
 }
 
 llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);
+    const auto * kv_state = get_state_unified();
 
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_state);
 
@@ -1271,7 +1296,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);
+    const auto * kv_state = get_state_unified();
 
     // store to KV cache
     {
@@ -1449,7 +1474,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
         int32_t n_state,
         int32_t n_seqs) const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    const auto * kv_state = get_state_recurrent();
 
     const auto n_kv    = kv_state->get_n_kv();
     const auto kv_head = kv_state->get_head();
@@ -1481,7 +1506,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
         int il) const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    const auto * kv_state = get_state_recurrent();
 
     const auto token_shift_count = hparams.token_shift_count;
 
@@ -1502,7 +1527,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
         int il) const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    const auto * kv_state = get_state_recurrent();
 
     const auto token_shift_count = hparams.token_shift_count;
     const auto n_embd            = hparams.n_embd;
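The switch from static_cast to the checked lookup is what makes the hybrid cache safe to pass through these builders: a static_cast would silently reinterpret a hybrid state as whatever type the builder expected, while dynamic_cast reports the mismatch so the accessor can unwrap the right half instead. A small illustration, again with stand-in types rather than the real llama.cpp ones:

#include <cstdio>

struct memory_state_i  { virtual ~memory_state_i() = default; };
struct recurrent_state : memory_state_i {};
struct hybrid_state    : memory_state_i { /* wraps attn + recurrent halves */ };

int main() {
    hybrid_state h;
    const memory_state_i * mstate = &h;

    // Pre-commit behavior: static_cast would blindly treat the hybrid state
    // as a recurrent one and read through a wrong-type pointer.
    // Post-commit behavior: dynamic_cast yields nullptr on a type mismatch,
    // which is the signal to fall back to unwrapping the hybrid state.
    const auto * r = dynamic_cast<const recurrent_state *>(mstate);
    std::printf("direct cast: %s\n", r ? "hit" : "miss, unwrap the hybrid state");
    return 0;
}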

src/llama-graph.h

Lines changed: 4 additions & 0 deletions
@@ -531,6 +531,8 @@ struct llm_graph_context {
     // attention
     //
 
+    const llama_kv_cache_unified_state * get_state_unified() const;
+
     ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
             ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
@@ -605,6 +607,8 @@ struct llm_graph_context {
     // recurrent
     //
 
+    const llama_kv_cache_recurrent_state * get_state_recurrent() const;
+
     ggml_tensor * build_copy_mask_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
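In the header, each accessor is declared in the section whose builders use it: get_state_unified() alongside the attention helpers, get_state_recurrent() alongside the recurrent ones. From the calls in this diff, the hybrid state's interface can be inferred to expose one getter per half. The sketch below shows that assumed shape only; the real declaration lives in llama-kv-cache-hybrid-recurrent.h, which this commit does not touch, and the base interface used here is a stand-in.

// Minimal stand-in base so the sketch is self-contained; the real
// llama_memory_state_i interface is richer than this.
struct llama_memory_state_i { virtual ~llama_memory_state_i() = default; };

class llama_kv_cache_unified_state;   // defined in llama-kv-cache-unified.h
class llama_kv_cache_recurrent_state; // defined in llama-kv-cache-recurrent.h

// Assumed shape, inferred from hmstate->get_state_attn() and
// hmstate->get_state_recurrent() in the diff above; not the real header.
class llama_kv_cache_hybrid_recurrent_state : public llama_memory_state_i {
public:
    const llama_kv_cache_unified_state   * get_state_attn()      const;
    const llama_kv_cache_recurrent_state * get_state_recurrent() const;
};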
