@@ -90,36 +90,6 @@ struct llama_kv_cache_guard {
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // commit/restore cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    struct kv_cell {
-        llama_pos pos   = -1;
-        llama_pos delta =  0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
-    struct kv_layer {
-        ggml_tensor * k = nullptr;
-        ggml_tensor * v = nullptr;
-    };
-
     static uint32_t get_padding(const llama_cparams & cparams);

     llama_kv_cache_unified(
@@ -133,16 +103,6 @@ class llama_kv_cache_unified : public llama_kv_cache {

     ~llama_kv_cache_unified() = default;

-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
     //
     // llama_memory_i
     //
@@ -187,7 +147,7 @@ class llama_kv_cache_unified : public llama_kv_cache {

     bool get_can_shift() const override;

-    const kv_layer & get_layer(int32_t il) const;
+    uint32_t get_n() const;

     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
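With get_layer() removed from the public interface, callers no longer see kv_layer at all; they go through get_n() and the per-layer get_k()/get_v() accessors instead. A minimal sketch of how graph-building code might consume these accessors after this change (the kv_self, ctx0 and il names are illustrative, not taken from this commit):

    // number of KV cells to attend over, computed before each graph build
    const uint32_t n_kv = kv_self->get_n();

    // per-layer views over the K and V caches for layer il
    ggml_tensor * k = kv_self->get_k(ctx0, il);
    ggml_tensor * v = kv_self->get_v(ctx0, il);
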
@@ -210,12 +170,52 @@ class llama_kv_cache_unified : public llama_kv_cache {
     const llama_model & model;
     const llama_hparams & hparams;

+    // commit/restore cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta =  0;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    struct kv_layer {
+        ggml_tensor * k = nullptr;
+        ggml_tensor * v = nullptr;
+    };
+
     bool has_shift = false;
     bool do_defrag = false;

     bool v_trans   = true; // the value tensor is transposed
     bool can_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_impl also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
     // required padding
     uint32_t padding = 1;

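With this move, the cell bookkeeping (slot_range, kv_cell, kv_layer, and the head/size/used/n counters) becomes an implementation detail of the cache rather than part of its public interface. As an illustration only (this helper is hypothetical and assumes the cache keeps its cells in a std::vector<kv_cell>), internal code could use the kv_cell helpers like this:

    // hypothetical internal helper: count the cells that currently hold sequence `seq`
    static uint32_t count_cells_for_seq(const std::vector<kv_cell> & cells, llama_seq_id seq) {
        uint32_t res = 0;
        for (const auto & cell : cells) {
            if (!cell.is_empty() && cell.has_seq_id(seq)) {
                res++;
            }
        }
        return res;
    }
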
@@ -279,9 +279,9 @@ class llama_kv_cache_unified : public llama_kv_cache {
 // llama_kv_cache_unified_swa
 //

-// class llama_kv_cache_unified_swa : public llama_kv_cache {
-// public:
-// };
+class llama_kv_cache_unified_swa : public llama_kv_cache {
+public:
+};

 //
 // llama_kv_cache_recurrent