@@ -352,14 +352,19 @@ llama_memory_decode_state_ptr llama_kv_cache_unified::init(
         const llama_batch & batch,
         uint32_t n_ubatch,
         bool embd_pooled,
-        bool logits_all) {
+        bool logits_all,
+        bool split_equal) {
     GGML_UNUSED(embd_pooled);
 
     auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
 
     std::vector<llama_ubatch> ubatches;
     while (sbatch.n_tokens > 0) {
-        ubatches.push_back(sbatch.split_simple(n_ubatch));
+        if (split_equal) {
+            ubatches.push_back(sbatch.split_equal(n_ubatch));
+        } else {
+            ubatches.push_back(sbatch.split_simple(n_ubatch));
+        }
     }
 
     auto heads = prepare(ubatches);
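The only behavioral change in each of these `init()` overrides is how the `llama_sbatch` is carved into ubatches: `split_simple()` fills each ubatch with up to `n_ubatch` tokens in batch order, while `split_equal()` builds ubatches in which every participating sequence contributes the same number of tokens. A rough, self-contained sketch of that distinction (not the actual `llama_sbatch` logic; the per-sequence bookkeeping below is invented purely for illustration):

```cpp
// Toy illustration of the two splitting strategies selected by split_equal.
// This is NOT the llama_sbatch implementation; a batch is reduced here to
// per-sequence token counts just to show how ubatches are formed.
#include <algorithm>
#include <cstdio>
#include <vector>

// remaining[i] = tokens still to be scheduled for sequence i
using remaining_t = std::vector<int>;

// "simple" split: fill the ubatch with up to n_ubatch tokens in sequence order
static std::vector<int> take_simple(remaining_t & rem, int n_ubatch) {
    std::vector<int> ub(rem.size(), 0);
    int budget = n_ubatch;
    for (size_t i = 0; i < rem.size() && budget > 0; ++i) {
        const int take = std::min(rem[i], budget);
        ub[i]   = take;
        rem[i] -= take;
        budget -= take;
    }
    return ub;
}

// "equal" split: every sequence that still has tokens contributes the same
// number of tokens to the ubatch, bounded by n_ubatch in total
static std::vector<int> take_equal(remaining_t & rem, int n_ubatch) {
    std::vector<int> ub(rem.size(), 0);
    int n_active = 0;
    int n_min    = 0;
    for (int n : rem) {
        if (n > 0) {
            n_active += 1;
            n_min = (n_min == 0) ? n : std::min(n_min, n);
        }
    }
    if (n_active == 0) {
        return ub;
    }
    const int take = std::min(n_min, std::max(1, n_ubatch / n_active));
    for (size_t i = 0; i < rem.size(); ++i) {
        if (rem[i] > 0) {
            ub[i]   = take;
            rem[i] -= take;
        }
    }
    return ub;
}

int main() {
    remaining_t rem = {5, 3, 8};           // three sequences of different lengths
    auto ub_simple = take_simple(rem, 8);  // -> {5, 3, 0}: order-based fill
    rem = {5, 3, 8};
    auto ub_equal  = take_equal(rem, 8);   // -> {2, 2, 2}: same count per sequence
    printf("simple: %d %d %d\n", ub_simple[0], ub_simple[1], ub_simple[2]);
    printf("equal : %d %d %d\n", ub_equal[0],  ub_equal[1],  ub_equal[2]);
    return 0;
}
```

Equal-length ubatches are presumably what the recurrent cache in the last hunk relies on, which is why it asserts the flag rather than branching on it.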
@@ -1821,17 +1826,24 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
-llama_memory_decode_state_ptr llama_kv_cache_unified_iswa::init(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
+llama_memory_decode_state_ptr llama_kv_cache_unified_iswa::init(
+        const llama_batch & batch,
+        uint32_t n_ubatch,
+        bool embd_pooled,
+        bool logits_all,
+        bool split_equal) {
     GGML_UNUSED(embd_pooled);
 
     auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
 
     std::vector<llama_ubatch> ubatches;
 
     while (sbatch.n_tokens > 0) {
-        auto ubatch = sbatch.split_simple(n_ubatch);
-
-        ubatches.push_back(ubatch);
+        if (split_equal) {
+            ubatches.push_back(sbatch.split_equal(n_ubatch));
+        } else {
+            ubatches.push_back(sbatch.split_simple(n_ubatch));
+        }
     }
 
     auto heads_base = kv_base->prepare(ubatches);
@@ -2291,8 +2303,15 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
-llama_memory_decode_state_ptr llama_kv_cache_recurrent::init(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
+llama_memory_decode_state_ptr llama_kv_cache_recurrent::init(
+        const llama_batch & batch,
+        uint32_t n_ubatch,
+        bool embd_pooled,
+        bool logits_all,
+        bool split_equal) {
     GGML_UNUSED(embd_pooled);
+    // TODO: Should this just be ignored?
+    assert(split_equal);
 
     auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
 
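Unlike the two unified caches, the recurrent cache does not fall back to `split_simple()`; it asserts that callers request equal splits (the TODO above questions whether the flag should simply be ignored here instead). A hypothetical caller-side sketch, assuming the flag is derived from the model type via the existing `llama_model_is_recurrent()` API; the surrounding variable names are placeholders, not code from this change:

```cpp
// Hypothetical caller sketch (names are placeholders, not part of this change):
// recurrent memories need equal-length ubatches, so derive the flag from the model.
const bool split_equal = llama_model_is_recurrent(&model);

llama_memory_decode_state_ptr dstate =
    memory->init(batch, cparams.n_ubatch, embd_pooled, logits_all, split_equal);
if (!dstate) {
    // splitting or allocation failed; give up on this decode call
    return -1;
}
```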