Skip to content

Commit 2a89bac

Browse files
committed
kv-cache : add ubatch_next()
ggml-ci
1 parent 2c72b74 commit 2a89bac

File tree

3 files changed

+27
-17
lines changed

3 files changed

+27
-17
lines changed

src/llama-context.cpp

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,22 +1250,7 @@ int llama_context::decode(llama_batch & inp_batch) {
12501250
int64_t n_outputs_prev = 0;
12511251

12521252
while (sbatch.n_tokens > 0) {
1253-
llama_ubatch ubatch = llama_ubatch();
1254-
1255-
const auto & n_ubatch = cparams.n_ubatch;
1256-
1257-
if (is_recurrent) {
1258-
if (embd_pooled) {
1259-
// Pooled embeddings cannot be split across ubatches (yet)
1260-
ubatch = sbatch.split_seq(cparams.n_ubatch);
1261-
} else {
1262-
// recurrent model architectures are easier to implement
1263-
// with equal-length sequences
1264-
ubatch = sbatch.split_equal(cparams.n_ubatch);
1265-
}
1266-
} else {
1267-
ubatch = sbatch.split_simple(n_ubatch);
1268-
}
1253+
llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
12691254

12701255
// count the outputs in this u_batch
12711256
{
@@ -1435,7 +1420,7 @@ int llama_context::decode(llama_batch & inp_batch) {
14351420

14361421
// - do not defrag small contexts (i.e. < 2048 tokens)
14371422
// - count the padding towards the number of used tokens
1438-
const float fragmentation = kv->n >= 2048 ? std::max(0.0f, 1.0f - float(kv->used + kv->get_padding(cparams))/float(kv->n)) : 0.0f;
1423+
const float fragmentation = kv->n >= 2048 ? std::max(0.0f, 1.0f - float(kv->used + kv->padding)/float(kv->n)) : 0.0f;
14391424

14401425
// queue defragmentation for next llama_kv_cache_update
14411426
if (fragmentation > cparams.defrag_thold) {

src/llama-kv-cache.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,14 @@ bool llama_kv_cache_unified::find_slot(
476476
return true;
477477
}
478478

479+
// Produce the next micro-batch from the sequence batch for a unified KV cache.
// The unified cache has no per-sequence layout constraints, so a simple
// contiguous split of up to n_ubatch tokens is sufficient.
// @param sbatch      sequence batch being consumed (advanced by the split)
// @param n_ubatch    maximum number of tokens per micro-batch
// @param embd_pooled unused here — pooled embeddings only affect the
//                    recurrent cache's splitting strategy
llama_ubatch llama_kv_cache_unified::ubatch_next(
        llama_sbatch & sbatch,
        uint32_t n_ubatch,
        bool embd_pooled) const {
    GGML_UNUSED(embd_pooled);
    return sbatch.split_simple(n_ubatch);
}
486+
479487
uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
480488
// the FA kernels require padding to avoid extra runtime boundary checks
481489
return cparams.flash_attn ? 256u : 32u;
@@ -1539,6 +1547,15 @@ bool llama_kv_cache_recurrent::find_slot(
15391547
return n >= n_seqs;
15401548
}
15411549

1550+
// Produce the next micro-batch from the sequence batch for a recurrent KV cache.
// Recurrent architectures are easier to implement with equal-length sequences,
// so the default strategy is an equal split; pooled embeddings are the
// exception, since they cannot be split across ubatches (yet) and must be
// split per sequence instead.
llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
    if (!embd_pooled) {
        // common case: equal-length sequence split
        return sbatch.split_equal(n_ubatch);
    }

    // Pooled embeddings cannot be split across ubatches (yet)
    return sbatch.split_seq(n_ubatch);
}
1558+
15421559
uint32_t llama_kv_cache_recurrent::cell_max() const {
15431560
for (uint32_t i = size; i > 0; --i) {
15441561
const llama_kv_cell & cell = cells[i - 1];

src/llama-kv-cache.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
struct llama_cparams;
1414
struct llama_hparams;
1515
struct llama_ubatch;
16+
struct llama_sbatch;
1617

1718
struct llama_kv_cache : public llama_memory_i {
1819
// can be used to query data from the model if needed
@@ -44,6 +45,9 @@ struct llama_kv_cache : public llama_memory_i {
4445

4546
virtual bool find_slot(const llama_ubatch & batch) = 0;
4647

48+
// different KV caches require different batch splitting strategies
49+
virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
50+
4751
// simulate full cache, used for allocating worst-case compute buffers
4852
virtual void set_full() = 0;
4953

@@ -139,6 +143,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
139143
// to the first cell of the slot.
140144
bool find_slot(const llama_ubatch & batch) override;
141145

146+
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
147+
142148
static uint32_t get_padding(const llama_cparams & cparams);
143149

144150
// find how many cells are currently in use
@@ -263,6 +269,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
263269
// to the first cell of the slot.
264270
bool find_slot(const llama_ubatch & batch) override;
265271

272+
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
273+
266274
// find how many cells are currently in use
267275
uint32_t cell_max() const;
268276

0 commit comments

Comments
 (0)