@@ -90,36 +90,6 @@ struct llama_kv_cache_guard {
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // commit/restore cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    struct kv_cell {
-        llama_pos pos   = -1;
-        llama_pos delta =  0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
-    struct kv_layer {
-        ggml_tensor * k = nullptr;
-        ggml_tensor * v = nullptr;
-    };
-
     static uint32_t get_padding(const llama_cparams & cparams);

     llama_kv_cache_unified(
@@ -133,16 +103,6 @@ class llama_kv_cache_unified : public llama_kv_cache {

     ~llama_kv_cache_unified() = default;

-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
     //
     // llama_memory_i
     //
@@ -187,7 +147,7 @@ class llama_kv_cache_unified : public llama_kv_cache {

     bool get_can_shift() const override;

-    const kv_layer & get_layer(int32_t il) const;
+    uint32_t get_n() const;

     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
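With get_layer() removed from the public interface, callers no longer see kv_layer at all; they go through get_n() and the per-layer get_k()/get_v() accessors instead. A minimal sketch of how graph-building code might consume these accessors after this change (the kv_self, ctx0 and il names are illustrative, not taken from this commit):

    // number of KV cells to attend over, computed before each graph build
    const uint32_t n_kv = kv_self->get_n();

    // per-layer views over the K and V caches for layer il
    ggml_tensor * k = kv_self->get_k(ctx0, il);
    ggml_tensor * v = kv_self->get_v(ctx0, il);
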
@@ -210,12 +170,52 @@ class llama_kv_cache_unified : public llama_kv_cache {
     const llama_model & model;
     const llama_hparams & hparams;

+    // commit/restore cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta =  0;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    struct kv_layer {
+        ggml_tensor * k = nullptr;
+        ggml_tensor * v = nullptr;
+    };
+
     bool has_shift = false;
     bool do_defrag = false;

     bool v_trans   = true; // the value tensor is transposed
     bool can_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_impl also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
     // required padding
     uint32_t padding = 1;

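With this move, the cell bookkeeping (slot_range, kv_cell, kv_layer, and the head/size/used/n counters) becomes an implementation detail of the cache rather than part of its public interface. As an illustration only (this helper is hypothetical and assumes the cache keeps its cells in a std::vector<kv_cell>), internal code could use the kv_cell helpers like this:

    // hypothetical internal helper: count the cells that currently hold sequence `seq`
    static uint32_t count_cells_for_seq(const std::vector<kv_cell> & cells, llama_seq_id seq) {
        uint32_t res = 0;
        for (const auto & cell : cells) {
            if (!cell.is_empty() && cell.has_seq_id(seq)) {
                res++;
            }
        }
        return res;
    }
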
@@ -279,9 +279,9 @@ class llama_kv_cache_unified : public llama_kv_cache {
 // llama_kv_cache_unified_swa
 //

-// class llama_kv_cache_unified_swa : public llama_kv_cache {
-// public:
-// };
+class llama_kv_cache_unified_swa : public llama_kv_cache {
+public:
+};

 //
 // llama_kv_cache_recurrent