Revert "llama : remove deprecated llama_kv_self API (ggml-org#15472)"

Nexesenex · Nexesenex · commit 909c5c051ea6 · 2025-10-06T21:47:34.000+02:00
This reverts commit cd36b5e.
diff --git a/include/llama.h b/include/llama.h
@@ -666,6 +666,111 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
+    //
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
+
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+                struct llama_context * ctx),
+            "Use llama_memory_clear() instead");
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1)
+    // p1 < 0     : [p0, inf)
+    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "Use llama_memory_seq_rm() instead");
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "Use llama_memory_seq_cp() instead");
+
+    // Removes all tokens that do not belong to the specified sequence
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_keep() instead");
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    // p0 < 0 : [0,  p1)
+    // p1 < 0 : [p0, inf)
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta),
+            "Use llama_memory_seq_add() instead");
+
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d),
+            "Use llama_memory_seq_div() instead");
+
+    // Returns the smallest position present in the KV cache for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
+    // Return -1 if the sequence is empty
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_pos_min() instead");
+
+    // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
+    // Return -1 if the sequence is empty
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_pos_max() instead");
+
+    // Defragment the KV cache
+    // This will be applied:
+    //   - lazily on next llama_decode()
+    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
+
+    // Check if the context supports KV cache shifting
+    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
+            "use llama_memory_can_shift() instead");
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
+
     //
     // State / sessions
     //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -93,7 +93,7 @@ llama_context::llama_context(
     // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
     if (cparams.n_batch < GGML_KQ_MASK_PAD) {
         LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
         cparams.n_batch = GGML_KQ_MASK_PAD;
@@ -439,12 +439,26 @@ llama_memory_t llama_context::get_memory() const {
     return memory.get();
 }
 
-bool llama_context::memory_update(bool optimize) {
+// deprecated
+void llama_context::kv_self_defrag_sched() {
+    if (!memory) {
+        return;
+    }
+
+    memory_force_optimize = true;
+}
+
+// deprecated
+bool llama_context::kv_self_update(bool optimize) {
     if (!memory) {
         return false;
     }
 
     {
+        // TODO: remove in the future
+        optimize |= memory_force_optimize;
+        memory_force_optimize = false;
+
         const auto mctx = memory->init_update(this, optimize);
         switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
@@ -979,7 +993,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     bool did_optimize = false;
 
     // handle any pending defrags/shifts
-    memory_update(false);
+    kv_self_update(false);
 
     llama_memory_context_ptr mctx;
 
@@ -1004,7 +1018,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     if (!did_optimize) {
                         did_optimize = true;
 
-                        if (memory_update(true)) {
+                        if (kv_self_update(true)) {
                             LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
 
                             continue;
@@ -2324,6 +2338,11 @@ const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->get_model();
 }
 
+// deprecated
+void llama_kv_self_update(llama_context * ctx) {
+    ctx->kv_self_update(false);
+}
+
 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->pooling_type();
 }
@@ -2541,6 +2560,168 @@ bool llama_memory_can_shift(llama_memory_t mem) {
     return mem->get_can_shift();
 }
 
+//
+// kv cache
+//
+
+// deprecated
+int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
+    const auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return 0;
+    }
+
+    int32_t res = 0;
+
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
+}
+
+// deprecated
+// note: this returns the same value as llama_kv_self_n_tokens() rather than a true used-cell count - will be removed anyway, so it's ok
+int32_t llama_kv_self_used_cells(const llama_context * ctx) {
+    const auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return 0;
+    }
+
+    int32_t res = 0;
+
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
+}
+
+// deprecated
+void llama_kv_self_clear(llama_context * ctx) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return;
+    }
+
+    llama_memory_clear(kv, true);
+}
+
+// deprecated
+bool llama_kv_self_seq_rm(
+        llama_context * ctx,
+         llama_seq_id   seq_id,
+            llama_pos   p0,
+            llama_pos   p1) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return true;
+    }
+
+    return llama_memory_seq_rm(kv, seq_id, p0, p1);
+}
+
+// deprecated
+void llama_kv_self_seq_cp(
+        llama_context * ctx,
+         llama_seq_id   seq_id_src,
+         llama_seq_id   seq_id_dst,
+            llama_pos   p0,
+            llama_pos   p1) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return;
+    }
+
+    llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
+}
+
+// deprecated
+void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return;
+    }
+
+    llama_memory_seq_keep(kv, seq_id);
+}
+
+// deprecated
+void llama_kv_self_seq_add(
+        llama_context * ctx,
+         llama_seq_id   seq_id,
+            llama_pos   p0,
+            llama_pos   p1,
+            llama_pos   delta) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return;
+    }
+
+    llama_memory_seq_add(kv, seq_id, p0, p1, delta);
+}
+
+// deprecated
+void llama_kv_self_seq_div(
+        llama_context * ctx,
+         llama_seq_id   seq_id,
+            llama_pos   p0,
+            llama_pos   p1,
+                  int   d) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return;
+    }
+
+    llama_memory_seq_div(kv, seq_id, p0, p1, d);
+}
+
+// deprecated
+llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return -1;
+    }
+
+    return llama_memory_seq_pos_min(kv, seq_id);
+}
+
+// deprecated
+llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return -1;
+    }
+
+    return llama_memory_seq_pos_max(kv, seq_id);
+}
+
+// deprecated
+void llama_kv_self_defrag(llama_context * ctx) {
+    // force defrag
+    ctx->kv_self_defrag_sched();
+}
+
+// deprecated
+bool llama_kv_self_can_shift(const llama_context * ctx) {
+    auto * kv = llama_get_memory(ctx);
+    if (!kv) {
+        return false;
+    }
+
+    return llama_memory_can_shift(kv);
+}
+
 // llama state API
 
 // deprecated
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -46,8 +46,10 @@ struct llama_context {
 
     llama_memory_t get_memory() const;
 
-    // return true if the memory was updated
-    bool memory_update(bool optimize);
+    // return true if the KV cache was updated
+    // TODO: remove
+    bool kv_self_update(bool optimize);
+    void kv_self_defrag_sched();
 
     enum llama_pooling_type pooling_type() const;
 
@@ -228,6 +230,9 @@ struct llama_context {
 
     std::unique_ptr<llama_memory_i> memory;
 
+    // TODO: temporary, until the llama_kv_self_defrag() API is removed
+    bool memory_force_optimize = false;
+
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;