@@ -38,7 +38,7 @@ llama_kv_cache::llama_kv_cache(
     const uint32_t n_layer_kv = hparams.n_layer_kv();

     // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    buft_ctx_map_t ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -53,13 +53,12 @@ llama_kv_cache::llama_kv_cache(
                 return nullptr;
             }

-            ctx_map[buft] = ctx;
-            ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);

             return ctx;
         }

-        return it->second;
+        return it->second.get();
     };

     GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
@@ -167,19 +166,16 @@ llama_kv_cache::llama_kv_cache(
     }

     // allocate tensors and initialize the buffers to avoid NaNs in the padding
-    for (auto it : ctx_map) {
-        auto * buft = it.first;
-        auto * ctx  = it.second;
-
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+    for (auto & [buft, ctx] : ctx_map) {
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }

         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

         ggml_backend_buffer_clear(buf, 0);
-        bufs.emplace_back(buf);
+        ctxs_bufs.emplace_back(std::move(ctx), buf);
     }

     {
@@ -203,7 +199,7 @@ void llama_kv_cache::clear(bool data) {
     }

     if (data) {
-        for (auto & buf : bufs) {
+        for (auto & [_, buf] : ctxs_bufs) {
             ggml_backend_buffer_clear(buf.get(), 0);
         }
     }
@@ -472,8 +468,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }
@@ -1298,7 +1294,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
 size_t llama_kv_cache::total_size() const {
     size_t size = 0;

-    for (const auto & buf : bufs) {
+    for (const auto & [_, buf] : ctxs_bufs) {
         size += ggml_backend_buffer_get_size(buf.get());
     }

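For context: this change folds the previously separate ctxs and bufs members into a single ctxs_bufs container, so each ggml context stays paired with the backend buffer allocated from it, and ctx_map now owns its contexts (hence the .get() and std::move(ctx) calls in the hunks above). A minimal sketch of the declarations the diff appears to assume follows; the alias and member definitions below are inferred from the usage in the diff, not taken from the actual header:

    // assumed: owning map built while constructing the cache; ggml_context_ptr
    // and ggml_backend_buffer_ptr are the unique_ptr aliases from ggml-cpp.h
    using buft_ctx_map_t = std::map<ggml_backend_buffer_type_t, ggml_context_ptr>;

    // assumed: each context is stored next to the buffer allocated from it,
    // so both are released together when the cache is destroyed
    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;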