
Commit 8e39e04

refactor!: Rename all k/v related values for recurrent/hybrid to r/s
Anywhere that "kv_<state|cell|size|etc>" was used for the recurrent and hybrid caches, I've switched to the more generic "mem_" prefix. The specifics of "k" (key) translate to "r" (recurrent state), and those of "v" (value) translate to "s" (state-space embedding states).

Branch: HybridRecurrentCache

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 88213a9 commit 8e39e04
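
For orientation, a minimal before/after sketch of the naming convention this commit applies. The variable names `state` and `d_r` are illustrative only (not taken verbatim from the tree); `state` would be a `const llama_memory_recurrent_state *`.

    // before: the recurrent/hybrid paths reused the KV-cache vocabulary
    const int64_t  n_kv = state->get_n_kv();     // number of recurrent cells in the current view
    ggml_tensor *  r    = state->get_k_l(il);    // per-layer recurrent-state tensor, stored as "k"
    const uint32_t d_r  = hparams.n_embd_k_s();  // rolling-state embedding size

    // after: "r" = recurrent state, "s" = state-space embedding states
    const int64_t  n_rs = state->get_n_rs();
    ggml_tensor *  r    = state->get_r_l(il);
    const uint32_t d_r  = hparams.n_embd_r();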

File tree: 9 files changed (+241 −241 lines)

src/llama-graph.cpp

Lines changed: 19 additions & 19 deletions
@@ -242,15 +242,15 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
 void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);
 
-    const int64_t n_kv = kv_state->get_n_kv();
+    const int64_t n_rs = mem_state->get_n_rs();
 
     if (s_copy) {
         GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
         int32_t * data = (int32_t *) s_copy->data;
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            data[i] = kv_state->s_copy(i);
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mem_state->s_copy(i);
         }
     }
 }
@@ -406,18 +406,18 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
-        kv_state->get_state_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        mem_state->get_state_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
     }
 
-    const int64_t n_kv = kv_state->get_state_recurrent()->get_n_kv();
+    const int64_t n_rs = mem_state->get_state_recurrent()->get_n_rs();
 
     if (s_copy) {
         GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
         int32_t * data = (int32_t *) s_copy->data;
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            data[i] = kv_state->get_state_recurrent()->s_copy(i);
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mem_state->get_state_recurrent()->s_copy(i);
         }
     }
 }
@@ -1050,14 +1050,14 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
 }
 
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
-    const auto * kv_state = static_cast<const llama_memory_hybrid_state *>(mstate);
+    const auto * mem_state = static_cast<const llama_memory_hybrid_state *>(mstate);
 
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(hparams, cparams, kv_state);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(hparams, cparams, mem_state);
 
     {
         GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Hybrid recurrent is not supported with SWA attention layers");
 
-        const auto n_kv = inp->kv_state->get_state_attn()->get_n_kv();
+        const auto n_kv = inp->mem_state->get_state_attn()->get_n_kv();
 
         inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         //cb(inp->self_kq_mask, "KQ_mask", -1);
@@ -1067,9 +1067,9 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     }
 
     {
-        const auto n_kv = kv_state->get_state_recurrent()->get_n_kv();
+        const auto n_rs = mem_state->get_state_recurrent()->get_n_rs();
 
-        inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
+        inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
         ggml_set_input(inp->s_copy);
     }
 
@@ -1557,9 +1557,9 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
 
     auto inp = std::make_unique<llm_graph_input_rs>(kv_state);
 
-    const auto n_kv = kv_state->get_n_kv();
+    const auto n_rs = kv_state->get_n_rs();
 
-    inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
+    inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
     ggml_set_input(inp->s_copy);
 
     return (llm_graph_input_rs *) res->add_input(std::move(inp));
@@ -1574,7 +1574,7 @@ ggml_tensor * llm_graph_context::build_rs(
         bool avoid_copies) const {
     const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
 
-    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_kv(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies);
+    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies);
 }
 
 ggml_tensor * llm_graph_context::build_rs(
@@ -1586,7 +1586,7 @@ ggml_tensor * llm_graph_context::build_rs(
         bool avoid_copies) const {
     const auto * kv_state = static_cast<const llama_memory_hybrid_state *>(mstate)->get_state_recurrent();
 
-    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_kv(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies);
+    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies);
 }
 
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1600,11 +1600,11 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
 
     const int64_t n_seqs = ubatch.n_seqs;
 
-    ggml_tensor * token_shift_all = kv_state->get_k_l(il);
+    ggml_tensor * token_shift_all = kv_state->get_r_l(il);
 
     ggml_tensor * token_shift = build_rs(
             inp, gf, token_shift_all,
-            hparams.n_embd_k_s(), n_seqs);
+            hparams.n_embd_r(), n_seqs);
 
     token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
 
@@ -1627,7 +1627,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
     return ggml_cpy(
         ctx0,
         ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0),
-        ggml_view_1d(ctx0, kv_state->get_k_l(il), hparams.n_embd_k_s()*n_seqs, hparams.n_embd_k_s()*kv_head*ggml_element_size(kv_state->get_k_l(il)))
+        ggml_view_1d(ctx0, kv_state->get_r_l(il), hparams.n_embd_r()*n_seqs, hparams.n_embd_r()*kv_head*ggml_element_size(kv_state->get_r_l(il)))
     );
 }

src/llama-graph.h

Lines changed: 5 additions & 5 deletions
@@ -191,14 +191,14 @@ class llm_graph_input_cls : public llm_graph_input_i {
 
 class llm_graph_input_rs : public llm_graph_input_i {
 public:
-    llm_graph_input_rs(const llama_memory_recurrent_state * kv_state) : kv_state(kv_state) {}
+    llm_graph_input_rs(const llama_memory_recurrent_state * mem_state) : mem_state(mem_state) {}
     virtual ~llm_graph_input_rs() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * s_copy; // I32 [kv_size]
 
-    const llama_memory_recurrent_state * kv_state;
+    const llama_memory_recurrent_state * mem_state;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -306,10 +306,10 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i {
     llm_graph_input_mem_hybrid(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            const llama_memory_hybrid_state * kv_state) :
+            const llama_memory_hybrid_state * mem_state) :
         hparams(hparams),
         cparams(cparams),
-        kv_state(kv_state) {
+        mem_state(mem_state) {
     }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
@@ -325,7 +325,7 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i {
     const llama_hparams & hparams;
     const llama_cparams & cparams;
 
-    const llama_memory_hybrid_state * kv_state;
+    const llama_memory_hybrid_state * mem_state;
 };
 
 //

src/llama-hparams.cpp

Lines changed: 2 additions & 2 deletions
@@ -65,7 +65,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }
 
-uint32_t llama_hparams::n_embd_k_s() const {
+uint32_t llama_hparams::n_embd_r() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
@@ -76,7 +76,7 @@ uint32_t llama_hparams::n_embd_k_s() const {
     return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
 }
 
-uint32_t llama_hparams::n_embd_v_s() const {
+uint32_t llama_hparams::n_embd_s() const {
     if (wkv_head_size != 0) {
         // corresponds to RWKV's wkv_states size
         return n_embd * wkv_head_size;
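
To make the renamed sizes concrete, here is a worked example under assumed hyperparameters (the numbers are illustrative, not taken from any particular model): for a Mamba-style layer with ssm_d_conv = 4 and ssm_d_inner = 1536, n_embd_r() = (4 - 1) * 1536 = 4608, the size of the rolling convolution state; for an RWKV-style layer with n_embd = 2048, token_shift_count = 2, and wkv_head_size = 64, n_embd_r() = 2 * 2048 = 4096 and n_embd_s() = 2048 * 64 = 131072.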

src/llama-hparams.h

Lines changed: 2 additions & 2 deletions
@@ -184,10 +184,10 @@ struct llama_hparams {
 
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
-    uint32_t n_embd_k_s() const;
+    uint32_t n_embd_r() const;
 
     // dimension of the recurrent state embeddings
-    uint32_t n_embd_v_s() const;
+    uint32_t n_embd_s() const;
 
     // whether or not the given layer is recurrent (for hybrid models)
     bool recurrent_layer(uint32_t il) const;

src/llama-memory-hybrid.cpp

Lines changed: 26 additions & 26 deletions
@@ -11,48 +11,48 @@
 llama_memory_hybrid::llama_memory_hybrid(
     const llama_model & model,
         /* attn */
-        ggml_type attn_type_k,
-        ggml_type attn_type_v,
-        bool attn_v_trans,
-        uint32_t attn_kv_size,
-        uint32_t attn_n_pad,
-        uint32_t attn_n_swa,
-        llama_swa_type attn_swa_type,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        uint32_t kv_size,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
         /* recurrent */
-        ggml_type recurrent_type_k,
-        ggml_type recurrent_type_v,
-        uint32_t recurrent_kv_size,
+        ggml_type type_r,
+        ggml_type type_s,
+        uint32_t rs_size,
         /* common */
         uint32_t n_seq_max,
         bool offload,
         /* layer filters */
-        layer_filter_cb && attn_filter,
-        layer_filter_cb && recurrent_filter) :
+        layer_filter_cb && filter_attn,
+        layer_filter_cb && filter_recurrent) :
     hparams(model.hparams),
     mem_attn(new llama_kv_cache_unified(
         model,
-        attn_filter == nullptr ?
+        filter_attn == nullptr ?
             [&](int32_t il) { return !model.hparams.recurrent_layer(il); }
-            : attn_filter,
-        attn_type_k,
-        attn_type_v,
-        attn_v_trans,
+            : filter_attn,
+        type_k,
+        type_v,
+        v_trans,
         offload,
-        attn_kv_size,
+        kv_size,
         n_seq_max,
-        attn_n_pad,
-        attn_n_swa,
-        attn_swa_type
+        n_pad,
+        n_swa,
+        swa_type
     )),
     mem_recurrent(new llama_memory_recurrent(
         model,
-        recurrent_filter == nullptr ?
+        filter_recurrent == nullptr ?
             [&](int32_t il) { return model.hparams.recurrent_layer(il); }
-            : recurrent_filter,
-        recurrent_type_k,
-        recurrent_type_v,
+            : filter_recurrent,
+        type_r,
+        type_s,
         offload,
-        recurrent_kv_size,
+        rs_size,
         n_seq_max
     )) {}

src/llama-memory-hybrid.h

Lines changed: 12 additions & 12 deletions
@@ -25,23 +25,23 @@ class llama_memory_hybrid : public llama_memory_i {
     llama_memory_hybrid(
         const llama_model & model,
             /* attn */
-            ggml_type attn_type_k,
-            ggml_type attn_type_v,
-            bool attn_v_trans,
-            uint32_t attn_kv_size,
-            uint32_t attn_n_pad,
-            uint32_t attn_n_swa,
-            llama_swa_type attn_swa_type,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool v_trans,
+            uint32_t kv_size,
+            uint32_t n_pad,
+            uint32_t n_swa,
+            llama_swa_type swa_type,
             /* recurrent */
-            ggml_type recurrent_type_k,
-            ggml_type recurrent_type_v,
-            uint32_t recurrent_kv_size,
+            ggml_type type_r,
+            ggml_type type_s,
+            uint32_t rs_size,
             /* common */
             uint32_t n_seq_max,
             bool offload,
             /* layer filters */
-            layer_filter_cb && attn_filter = nullptr,
-            layer_filter_cb && recurrent_filter = nullptr);
+            layer_filter_cb && filter_attn = nullptr,
+            layer_filter_cb && filter_recurrent = nullptr);
 
     ~llama_memory_hybrid() = default;
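
For reference, a minimal construction sketch against the renamed signature; the `model` variable, the type choices, and the sizes are assumptions for illustration, and both layer filters are left at their nullptr defaults:

    // hypothetical usage; `model` is an already-loaded llama_model
    llama_memory_hybrid mem(
        model,
        /* attn      */ GGML_TYPE_F16, GGML_TYPE_F16, /*v_trans =*/ false,
                        /*kv_size =*/ 4096, /*n_pad =*/ 32, /*n_swa =*/ 0, LLAMA_SWA_TYPE_NONE,
        /* recurrent */ GGML_TYPE_F32, GGML_TYPE_F32, /*rs_size =*/ 1,
        /* common    */ /*n_seq_max =*/ 1, /*offload =*/ true);

With filter_attn and filter_recurrent left as nullptr, layers are split between the two sub-caches via hparams.recurrent_layer(il), as shown in the llama-memory-hybrid.cpp hunk above.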
