@@ -15,27 +15,22 @@
 // llama_kv_cache_unified
 //
 
-llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
-}
+llama_kv_cache_unified::llama_kv_cache_unified(
+        const llama_hparams & hparams,
+        callbacks cbs,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        uint32_t kv_size) : hparams(hparams), cbs(std::move(cbs)), v_trans(v_trans) {
 
-bool llama_kv_cache_unified::init(
-        const llama_model & model,
-        const llama_cparams & cparams,
-        ggml_type type_k,
-        ggml_type type_v,
-        uint32_t kv_size,
-        bool offload) {
     const int32_t n_layer = hparams.n_layer;
 
     has_shift = false;
 
-    GGML_ASSERT(!llama_model_is_recurrent(&model));
-
-    v_trans = !cparams.flash_attn;
     can_shift = true;
 
-    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
-            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
+    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
+            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
 
     head = 0;
     size = kv_size;
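
Usage note: with init() folded into the constructor, a construction failure now surfaces as an exception instead of a bool. A minimal sketch of the new call site, assuming the callbacks struct declared in the header exposes a std::function-style get_buft member and that model/cparams are in scope (the cell types and kv_size below are illustrative):

    llama_kv_cache_unified::callbacks cbs;
    cbs.get_buft = [](int32_t il) {
        (void) il;                              // same buffer type for every layer
        return ggml_backend_cpu_buffer_type();  // no offload in this sketch
    };

    try {
        llama_kv_cache_unified kv(
                model.hparams, std::move(cbs),
                GGML_TYPE_F16, GGML_TYPE_F16,
                /*v_trans =*/ !cparams.flash_attn, // moved out of init(), see the deleted line above
                /*kv_size =*/ 4096);
    } catch (const std::runtime_error & err) {
        LLAMA_LOG_ERROR("%s: failed to create KV cache: %s\n", __func__, err.what());
    }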
@@ -79,25 +74,11 @@ bool llama_kv_cache_unified::init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        const char * dev_name = "CPU";
-
-        ggml_backend_buffer_type_t buft;
-        if (offload) {
-            auto * dev = model.dev_layer(i);
-            buft = ggml_backend_dev_buffer_type(dev);
-
-            dev_name = ggml_backend_dev_name(dev);
-        } else {
-            buft = ggml_backend_cpu_buffer_type();
-        }
-
-        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
-                i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
+        ggml_backend_buffer_type_t buft = cbs.get_buft(i);
 
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
-            return false;
+            throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
         ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
@@ -115,15 +96,12 @@ bool llama_kv_cache_unified::init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+            throw std::runtime_error("failed to allocate buffer for kv cache");
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         bufs.emplace_back(buf);
     }
-
-    return true;
 }
 
 int32_t llama_kv_cache_unified::get_n_tokens() const {
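
The per-layer device selection deleted from init() above is not lost; it moves behind cbs.get_buft at the call site. A sketch of a callback reproducing the old offload behavior (capturing model and offload from the surrounding scope is an assumption, not shown in this diff):

    cbs.get_buft = [&model, offload](int32_t il) {
        if (offload) {
            // the same per-layer device lookup the old init() performed
            return ggml_backend_dev_buffer_type(model.dev_layer(il));
        }
        return ggml_backend_cpu_buffer_type();
    };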
@@ -480,7 +458,7 @@ bool llama_kv_cache_unified::find_slot(
     return true;
 }
 
-uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
+uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
     // the FA kernels require padding to avoid extra runtime boundary checks
     return cparams.flash_attn ? 256u : 32u;
 }
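
With the const qualifier dropped, get_padding no longer depends on instance state, so the header can declare it static and callers can query the padding before any cache exists, e.g. when rounding up the context size (a sketch; the static declaration is assumed from the header, not shown here):

    // round the requested context up to the KV padding granularity
    cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_unified::get_padding(cparams));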
@@ -1021,24 +999,16 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
 // llama_kv_cache_recurrent
 //
 
-llama_kv_cache_recurrent::llama_kv_cache_recurrent(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
-}
-
-bool llama_kv_cache_recurrent::init(
-        const llama_model & model,
-        const llama_cparams & cparams,
-        ggml_type type_k,
-        ggml_type type_v,
-        uint32_t kv_size,
-        bool offload) {
-    GGML_UNUSED(cparams);
-
+llama_kv_cache_recurrent::llama_kv_cache_recurrent(
+        const llama_hparams & hparams,
+        callbacks cbs,
+        ggml_type type_k,
+        ggml_type type_v,
+        uint32_t kv_size) : hparams(hparams), cbs(std::move(cbs)) {
     const int32_t n_layer = hparams.n_layer;
 
-    GGML_ASSERT(llama_model_is_recurrent(&model));
-
-    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
-            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
+            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     head = 0;
     size = kv_size;
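
The recurrent cache follows the same constructor pattern, minus the v_trans flag (flash-attention V transposition does not apply to recurrent state). A sketch with the same assumed callbacks wiring as above; the F32 cell types and n_seq_max size are illustrative:

    llama_kv_cache_recurrent kv_r(
            model.hparams, std::move(cbs),
            GGML_TYPE_F32, GGML_TYPE_F32,
            /*kv_size =*/ n_seq_max);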
@@ -1082,25 +1052,11 @@ bool llama_kv_cache_recurrent::init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        const char * dev_name = "CPU";
-
-        ggml_backend_buffer_type_t buft;
-        if (offload) {
-            auto * dev = model.dev_layer(i);
-            buft = ggml_backend_dev_buffer_type(dev);
-
-            dev_name = ggml_backend_dev_name(dev);
-        } else {
-            buft = ggml_backend_cpu_buffer_type();
-        }
-
-        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
-                i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
+        ggml_backend_buffer_type_t buft = cbs.get_buft(i);
 
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
-            return false;
+            throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
         ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
@@ -1118,15 +1074,12 @@ bool llama_kv_cache_recurrent::init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+            throw std::runtime_error("failed to allocate buffer for kv cache");
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         bufs.emplace_back(buf);
     }
-
-    return true;
 }
 
 int32_t llama_kv_cache_recurrent::get_n_tokens() const {
@@ -1558,11 +1511,6 @@ bool llama_kv_cache_recurrent::find_slot(
     return n >= n_seqs;
 }
 
-uint32_t llama_kv_cache_recurrent::get_padding(const llama_cparams & cparams) const {
-    // the FA kernels require padding to avoid extra runtime boundary checks
-    return cparams.flash_attn ? 256u : 32u;
-}
-
 uint32_t llama_kv_cache_recurrent::cell_max() const {
     for (uint32_t i = size; i > 0; --i) {
         const llama_kv_cell & cell = cells[i - 1];