Commit c47ead4

return maps, merge, non-GPU==Host
1 parent fc10841 commit c47ead4

13 files changed: +89 -98 lines changed

src/llama-context.cpp

Lines changed: 51 additions & 64 deletions
@@ -2030,20 +2030,11 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-
-    for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        { // memory allocated statically on device of the backend itself
-            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
-        { // memory allocated on host for backend
-            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(ggml_backend_get_device(backend));
-            if (ret.count(buft) != 0) {
-                continue; // multiple backends may use the same host buffer type
-            }
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
     }
     for (const auto & backend_ptr : backends) {
         ggml_backend_t backend = backend_ptr.get();
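
The new memory_breakdown() builds its result by summing plain per-source maps: each component reports bytes per buffer type, and std::map::operator[] creates a zero-valued entry on first access, so the += accumulation needs no explicit initialization. A minimal, self-contained sketch of that pattern (int stands in for ggml_backend_buffer_type_t, and the local struct stands in for llama_memory_breakdown_data):

#include <cstdio>
#include <map>

struct breakdown {
    std::size_t model   = 0;
    std::size_t context = 0;
};

int main() {
    // per-source byte counts keyed by a stand-in buffer type id
    std::map<int, std::size_t> model_mb   = {{0, 100}, {1, 50}};
    std::map<int, std::size_t> context_mb = {{0, 10}};

    std::map<int, breakdown> ret;
    for (const auto & buft_size : model_mb) {
        ret[buft_size.first].model += buft_size.second; // operator[] value-initializes the entry
    }
    for (const auto & buft_size : context_mb) {
        ret[buft_size.first].context += buft_size.second;
    }
    for (const auto & kv : ret) {
        printf("buft %d: model=%zu context=%zu\n", kv.first, kv.second.model, kv.second.context);
    }
    return 0;
}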
@@ -2806,74 +2797,70 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     constexpr size_t MiB = 1024 * 1024;
     const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
 
-    // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    llama_memory_breakdown_data mb_host_acc = memory_breakdown[ggml_backend_cpu_buffer_type()];
     // track seen host buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types = {ggml_backend_cpu_buffer_type()};
+    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types;
 
+    // GPU devices have their own memory, print a breakdown for each GPU on a single line:
     for (const ggml_backend_dev_t & dev : devices) {
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            continue;
+        }
         ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
-
         const llama_memory_breakdown_data & mb = memory_breakdown[buft];
 
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            const std::string name = ggml_backend_buft_name(buft);
-            std::string desc = ggml_backend_dev_description(dev);
-            for (const std::string & prefix : desc_prefixes_strip) {
-                if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                    desc = desc.substr(prefix.length());
-                }
+        const std::string name = ggml_backend_buft_name(buft);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
             }
-
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-
-            const size_t self = mb.model + mb.context + mb.compute;
-            const size_t unaccounted = total - self - free;
-
-            table_data.push_back({
-                template_gpu,
-                " - " + name + " (" + desc + ")",
-                std::to_string(total / MiB),
-                std::to_string(free / MiB),
-                std::to_string(self / MiB),
-                std::to_string(mb.model / MiB),
-                std::to_string(mb.context / MiB),
-                std::to_string(mb.compute / MiB),
-                std::to_string(unaccounted / MiB)});
-        } else {
-            if (seen_host_buffer_types.count(buft) == 1) {
-                continue;
-            }
-            mb_host_acc.model += mb.model;
-            mb_host_acc.context += mb.context;
-            mb_host_acc.compute += mb.compute;
-            seen_host_buffer_types.insert(buft);
         }
 
-        ggml_backend_buffer_type_t buft_host = ggml_backend_dev_host_buffer_type(dev);
-        if (!buft_host) {
-            continue;
-        }
-        if (seen_host_buffer_types.count(buft_host) == 1) {
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+        seen_host_buffer_types.insert(buft);
+    }
+
+    // "free" host memory is poorly defined, instead track only memory that we know is being used:
+    llama_memory_breakdown_data mb_host = {0, 0, 0};
+
+    // consolidate all memory buffers not on any of the models GPU devices as host memory:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        const llama_memory_breakdown_data & mb_host = memory_breakdown[buft_host];
-        mb_host_acc.model += mb_host.model;
-        mb_host_acc.context += mb_host.context;
-        mb_host_acc.compute += mb_host.compute;
-        seen_host_buffer_types.insert(buft_host);
+        mb_host.model += mb.model;
+        mb_host.context += mb.context;
+        mb_host.compute += mb.compute;
+        seen_host_buffer_types.insert(buft);
     }
-    const size_t self_host = mb_host_acc.model + mb_host_acc.context + mb_host_acc.compute;
+
+    const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
     table_data.push_back({
         template_host,
         " - Host",
         "", // total
         "", // free
         std::to_string(self_host / MiB),
-        std::to_string(mb_host_acc.model / MiB),
-        std::to_string(mb_host_acc.context / MiB),
-        std::to_string(mb_host_acc.compute / MiB),
+        std::to_string(mb_host.model / MiB),
+        std::to_string(mb_host.context / MiB),
+        std::to_string(mb_host.compute / MiB),
         ""}); // unaccounted
 
     for (size_t j = 1; j < table_data[0].size(); j++) {
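
This second loop is the "non-GPU==Host" part of the commit title: instead of asking each backend for its host buffer type, the printer now folds every buffer type that was not already attributed to a GPU device into a single Host row. A stripped-down sketch of that bucketing, with int as a stand-in buffer type id and a single byte total instead of the model/context/compute split:

#include <cstdio>
#include <map>
#include <set>

int main() {
    // total bytes per buffer type (stand-in ids)
    std::map<int, std::size_t> memory_breakdown = {{0, 4096}, {1, 2048}, {2, 512}, {3, 64}};
    // buffer types already printed as per-GPU rows
    std::set<int> seen_host_buffer_types = {0, 1};

    std::size_t host_total = 0;
    for (const auto & buft_mb : memory_breakdown) {
        if (seen_host_buffer_types.count(buft_mb.first) == 1) {
            continue; // already accounted for on a GPU line
        }
        host_total += buft_mb.second; // everything else is treated as host memory
    }
    printf("Host: %zu bytes\n", host_total); // prints 576
    return 0;
}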

src/llama-context.h

Lines changed: 3 additions & 3 deletions
@@ -23,9 +23,9 @@ struct llama_memory_context_i;
 
 // "memory" as in physical memory for a buffer type, in bytes
 struct llama_memory_breakdown_data {
-    size_t model; // memory allocated for the model
-    size_t context; // memory allocated for the context
-    size_t compute; // memory allocated for temporary compute buffers
+    size_t model = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
 };
 
 struct llama_context {
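
The `= 0` member initializers are belt-and-braces for the new accumulation style: a map lookup already value-initializes its entry, and the print routine initializes mb_host with {0, 0, 0} explicitly, but with the defaults any plain local declaration also starts from zero instead of indeterminate values. A small sketch with a local stand-in for the struct:

#include <cstdio>
#include <map>

// local stand-in for llama_memory_breakdown_data with the same defaults
struct breakdown_sketch {
    std::size_t model   = 0;
    std::size_t context = 0;
    std::size_t compute = 0;
};

int main() {
    breakdown_sketch mb_host;           // all members start at 0 thanks to the defaults
    std::map<int, breakdown_sketch> ret;
    ret[42].model += 7;                 // operator[] default-constructs, then we accumulate
    printf("%zu %zu %zu %zu\n", mb_host.model, ret[42].model, ret[42].context, ret[42].compute);
    return 0;
}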

src/llama-kv-cache-iswa.cpp

Lines changed: 6 additions & 2 deletions
@@ -113,8 +113,12 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
-size_t llama_kv_cache_iswa::memory_use(ggml_backend_buffer_type_t buft) const {
-    return kv_base->memory_use(buft) + kv_swa->memory_use(buft);
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
 }
 
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
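
llama_kv_cache_iswa here and llama_memory_hybrid further down merge two child breakdowns with the same loop: copy one map, then add the other's byte counts per buffer type. A hypothetical generic helper (not part of this commit) that would express the pattern once:

#include <cstddef>
#include <map>

// Hypothetical helper: fold breakdown b into a, summing bytes per buffer type.
template <typename K>
std::map<K, std::size_t> merge_breakdowns(std::map<K, std::size_t> a, const std::map<K, std::size_t> & b) {
    for (const auto & kv : b) {
        a[kv.first] += kv.second;
    }
    return a;
}

With it, the body above would reduce to return merge_breakdowns(kv_base->memory_breakdown(), kv_swa->memory_breakdown());.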

src/llama-kv-cache-iswa.h

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ class llama_kv_cache_iswa : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-kv-cache.cpp

Lines changed: 4 additions & 7 deletions
@@ -473,15 +473,12 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
-size_t llama_kv_cache::memory_use(ggml_backend_buffer_type_t buft) const {
-    size_t n_bytes = 0;
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        if (ggml_backend_buffer_get_type(buf_ptr.get()) != buft) {
-            continue;
-        }
-        n_bytes += ggml_backend_buffer_get_size(buf_ptr.get());
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
     }
-    return n_bytes;
+    return ret;
 }
 
 llama_memory_context_ptr llama_kv_cache::init_batch(

src/llama-kv-cache.h

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ class llama_kv_cache : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-memory-hybrid.cpp

Lines changed: 6 additions & 2 deletions
@@ -166,8 +166,12 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-size_t llama_memory_hybrid::memory_use(ggml_backend_buffer_type_t buft) const {
-    return mem_attn->memory_use(buft) + mem_recr->memory_use(buft);
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
 }
 
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {

src/llama-memory-hybrid.h

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class llama_memory_hybrid : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-memory-recurrent.cpp

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,6 @@
 #include "llama-memory-recurrent.h"
 
+#include "ggml-alloc.h"
 #include "llama-impl.h"
 #include "llama-io.h"
 #include "llama-batch.h"
@@ -359,15 +360,12 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
-size_t llama_memory_recurrent::memory_use(ggml_backend_buffer_type_t buft) const {
-    size_t n_bytes = 0;
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        if (ggml_backend_buffer_get_type(buf_ptr.get()) != buft) {
-            continue;
-        }
-        n_bytes += ggml_backend_buffer_get_size(buf_ptr.get());
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
     }
-    return n_bytes;
+    return ret;
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {

src/llama-memory-recurrent.h

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,7 +51,7 @@ class llama_memory_recurrent : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     bool prepare(const std::vector<llama_ubatch> & ubatches);
