ggml-org
diff --git a/common/common.cpp b/common/common.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/common/speculative.cpp b/common/speculative.cpp
Lines changed: 6 additions & 4 deletions
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/include/llama.h b/include/llama.h
Lines changed: 42 additions & 37 deletions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
Lines changed: 10 additions & 6 deletions
@@ -909,7 +909,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+    llama_kv_cache * kv = llama_get_kv_cache(lctx);
+
+    if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) {
         LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1014,7 +1016,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_cache_clear(kv);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
     }
 
@@ -171,8 +171,10 @@ llama_tokens common_speculative_gen_draft(
     llama_tokens result;
     result.reserve(params.n_draft);
 
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
+
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_clear(kv);
 
         prompt.clear();
     } else {
@@ -191,14 +193,14 @@ llama_tokens common_speculative_gen_draft(
         }
 
         if (reuse_i > 0) {
-            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+            llama_kv_cache_seq_rm (kv, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i);
 
             prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
         }
 
         if (reuse_n < (int) prompt.size()) {
-            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+            llama_kv_cache_seq_rm (kv, 0, reuse_n, -1);
 
             prompt.erase(prompt.begin() + reuse_n, prompt.end());
         }
 
@@ -34,10 +34,11 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    const struct llama_model * model = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_cache_clear(kv);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
 
@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -467,8 +468,9 @@ extern "C" {
 
     DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
-    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx); // TODO: remove const?
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_cache(      struct llama_context * ctx);
+    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx);
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -583,7 +585,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -638,41 +640,47 @@ extern "C" {
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_cache_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_cache_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+            struct llama_kv_cache * kv);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
     LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1);
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id,
+                        llama_pos   p0,
+                        llama_pos   p1);
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1);
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id_src,
+                     llama_seq_id   seq_id_dst,
+                        llama_pos   p0,
+                        llama_pos   p1);
 
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -681,11 +689,11 @@ extern "C" {
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta);
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id,
+                        llama_pos   p0,
+                        llama_pos   p1,
+                        llama_pos   delta);
 
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -694,31 +702,28 @@ extern "C" {
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id,
+                        llama_pos   p0,
+                        llama_pos   p1,
+                              int   d);
 
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    //       how to avoid this?
+            struct llama_kv_cache * kv,
+                     llama_seq_id   seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
     //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv);
 
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv);
 
     //
     // State / sessions
 
@@ -602,11 +602,15 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }
 
-const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->model;
 }
 
-enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+llama_kv_cache * llama_get_kv_cache(llama_context * ctx) {
+    return &ctx->kv_self;
+}
+
+enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }
 
@@ -1142,7 +1146,7 @@ struct llama_data_read {
         if (dest_seq_id != -1) {
             // single sequence
 
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            kv_self.seq_rm(dest_seq_id, -1, -1);
 
             llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
             batch.n_tokens = cell_count;
@@ -1185,7 +1189,7 @@ struct llama_data_read {
                 return false;
             }
 
-            llama_kv_cache_clear(kv_self);
+            kv_self.clear();
 
             for (uint32_t i = 0; i < cell_count; ++i) {
                 llama_kv_cell & cell = kv_self.cells[i];
@@ -1362,9 +1366,9 @@ struct llama_data_read {
 
         if (!res) {
             if (seq_id == -1) {
-                llama_kv_cache_clear(ctx);
+                ctx->kv_self.clear();
             } else {
-                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+                ctx->kv_self.seq_rm(seq_id, -1, -1);
             }
             throw std::runtime_error("failed to restore kv cache");
         }
Original file line number	Diff line number	Diff line change
`@@ -909,7 +909,9 @@ struct common_init_result common_init_from_params(common_params & params) {`
`909`	`909`	`return iparams;`
`910`	`910`	`}`
`911`	`911`
`912`		`- if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {`
	`912`	`+ llama_kv_cache * kv = llama_get_kv_cache(lctx);`
	`913`	`+`
	`914`	`+ if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) {`
`913`	`915`	`LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);`
`914`	`916`	`params.ctx_shift = false;`
`915`	`917`	`}`
`@@ -1014,7 +1016,7 @@ struct common_init_result common_init_from_params(common_params & params) {`
`1014`	`1016`	`if (llama_model_has_decoder(model)) {`
`1015`	`1017`	`llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));`
`1016`	`1018`	`}`
`1017`		`- llama_kv_cache_clear(lctx);`
	`1019`	`+ llama_kv_cache_clear(kv);`
`1018`	`1020`	`llama_synchronize(lctx);`
`1019`	`1021`	`llama_perf_context_reset(lctx);`
`1020`	`1022`	`}`