Commit c47ead4

return maps, merge, non-GPU==Host
1 parent fc10841 commit c47ead4

13 files changed: +89 -98 lines changed

src/llama-context.cpp

Lines changed: 51 additions & 64 deletions
@@ -2030,20 +2030,11 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-
-    for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        { // memory allocated statically on device of the backend itself
-            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
-        { // memory allocated on host for backend
-            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(ggml_backend_get_device(backend));
-            if (ret.count(buft) != 0) {
-                continue; // multiple backends may use the same host buffer type
-            }
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
     }
     for (const auto & backend_ptr : backends) {
         ggml_backend_t backend = backend_ptr.get();
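
The new memory_breakdown() builds its result by summing plain per-source maps: each component reports bytes per buffer type, and std::map::operator[] creates a zero-valued entry on first access, so the += accumulation needs no explicit initialization. A minimal, self-contained sketch of that pattern (int stands in for ggml_backend_buffer_type_t, and the local struct stands in for llama_memory_breakdown_data):

#include <cstdio>
#include <map>

struct breakdown {
    std::size_t model   = 0;
    std::size_t context = 0;
};

int main() {
    // per-source byte counts keyed by a stand-in buffer type id
    std::map<int, std::size_t> model_mb   = {{0, 100}, {1, 50}};
    std::map<int, std::size_t> context_mb = {{0, 10}};

    std::map<int, breakdown> ret;
    for (const auto & buft_size : model_mb) {
        ret[buft_size.first].model += buft_size.second; // operator[] value-initializes the entry
    }
    for (const auto & buft_size : context_mb) {
        ret[buft_size.first].context += buft_size.second;
    }
    for (const auto & kv : ret) {
        printf("buft %d: model=%zu context=%zu\n", kv.first, kv.second.model, kv.second.context);
    }
    return 0;
}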
@@ -2806,74 +2797,70 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     constexpr size_t MiB = 1024 * 1024;
     const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
 
-    // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    llama_memory_breakdown_data mb_host_acc = memory_breakdown[ggml_backend_cpu_buffer_type()];
     // track seen host buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types = {ggml_backend_cpu_buffer_type()};
+    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types;
 
+    // GPU devices have their own memory, print a breakdown for each GPU on a single line:
     for (const ggml_backend_dev_t & dev : devices) {
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            continue;
+        }
         ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
-
         const llama_memory_breakdown_data & mb = memory_breakdown[buft];
 
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            const std::string name = ggml_backend_buft_name(buft);
-            std::string desc = ggml_backend_dev_description(dev);
-            for (const std::string & prefix : desc_prefixes_strip) {
-                if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                    desc = desc.substr(prefix.length());
-                }
+        const std::string name = ggml_backend_buft_name(buft);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
             }
-
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-
-            const size_t self = mb.model + mb.context + mb.compute;
-            const size_t unaccounted = total - self - free;
-
-            table_data.push_back({
-                template_gpu,
-                " - " + name + " (" + desc + ")",
-                std::to_string(total / MiB),
-                std::to_string(free / MiB),
-                std::to_string(self / MiB),
-                std::to_string(mb.model / MiB),
-                std::to_string(mb.context / MiB),
-                std::to_string(mb.compute / MiB),
-                std::to_string(unaccounted / MiB)});
-        } else {
-            if (seen_host_buffer_types.count(buft) == 1) {
-                continue;
-            }
-            mb_host_acc.model += mb.model;
-            mb_host_acc.context += mb.context;
-            mb_host_acc.compute += mb.compute;
-            seen_host_buffer_types.insert(buft);
         }
 
-        ggml_backend_buffer_type_t buft_host = ggml_backend_dev_host_buffer_type(dev);
-        if (!buft_host) {
-            continue;
-        }
-        if (seen_host_buffer_types.count(buft_host) == 1) {
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+        seen_host_buffer_types.insert(buft);
+    }
+
+    // "free" host memory is poorly defined, instead track only memory that we know is being used:
+    llama_memory_breakdown_data mb_host = {0, 0, 0};
+
+    // consolidate all memory buffers not on any of the models GPU devices as host memory:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        const llama_memory_breakdown_data & mb_host = memory_breakdown[buft_host];
-        mb_host_acc.model += mb_host.model;
-        mb_host_acc.context += mb_host.context;
-        mb_host_acc.compute += mb_host.compute;
-        seen_host_buffer_types.insert(buft_host);
+        mb_host.model += mb.model;
+        mb_host.context += mb.context;
+        mb_host.compute += mb.compute;
+        seen_host_buffer_types.insert(buft);
     }
-    const size_t self_host = mb_host_acc.model + mb_host_acc.context + mb_host_acc.compute;
+
+    const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
     table_data.push_back({
         template_host,
         " - Host",
         "", // total
         "", // free
         std::to_string(self_host / MiB),
-        std::to_string(mb_host_acc.model / MiB),
-        std::to_string(mb_host_acc.context / MiB),
-        std::to_string(mb_host_acc.compute / MiB),
+        std::to_string(mb_host.model / MiB),
+        std::to_string(mb_host.context / MiB),
+        std::to_string(mb_host.compute / MiB),
         ""}); // unaccounted
 
     for (size_t j = 1; j < table_data[0].size(); j++) {
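
This second loop is the "non-GPU==Host" part of the commit title: instead of asking each backend for its host buffer type, the printer now folds every buffer type that was not already attributed to a GPU device into a single Host row. A stripped-down sketch of that bucketing, with int as a stand-in buffer type id and a single byte total instead of the model/context/compute split:

#include <cstdio>
#include <map>
#include <set>

int main() {
    // total bytes per buffer type (stand-in ids)
    std::map<int, std::size_t> memory_breakdown = {{0, 4096}, {1, 2048}, {2, 512}, {3, 64}};
    // buffer types already printed as per-GPU rows
    std::set<int> seen_host_buffer_types = {0, 1};

    std::size_t host_total = 0;
    for (const auto & buft_mb : memory_breakdown) {
        if (seen_host_buffer_types.count(buft_mb.first) == 1) {
            continue; // already accounted for on a GPU line
        }
        host_total += buft_mb.second; // everything else is treated as host memory
    }
    printf("Host: %zu bytes\n", host_total); // prints 576
    return 0;
}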

src/llama-context.h

Lines changed: 3 additions & 3 deletions
@@ -23,9 +23,9 @@ struct llama_memory_context_i;
 
 // "memory" as in physical memory for a buffer type, in bytes
 struct llama_memory_breakdown_data {
-    size_t model; // memory allocated for the model
-    size_t context; // memory allocated for the context
-    size_t compute; // memory allocated for temporary compute buffers
+    size_t model = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
 };
 
 struct llama_context {
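
The `= 0` member initializers are belt-and-braces for the new accumulation style: a map lookup already value-initializes its entry, and the print routine initializes mb_host with {0, 0, 0} explicitly, but with the defaults any plain local declaration also starts from zero instead of indeterminate values. A small sketch with a local stand-in for the struct:

#include <cstdio>
#include <map>

// local stand-in for llama_memory_breakdown_data with the same defaults
struct breakdown_sketch {
    std::size_t model   = 0;
    std::size_t context = 0;
    std::size_t compute = 0;
};

int main() {
    breakdown_sketch mb_host;           // all members start at 0 thanks to the defaults
    std::map<int, breakdown_sketch> ret;
    ret[42].model += 7;                 // operator[] default-constructs, then we accumulate
    printf("%zu %zu %zu %zu\n", mb_host.model, ret[42].model, ret[42].context, ret[42].compute);
    return 0;
}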

src/llama-kv-cache-iswa.cpp

Lines changed: 6 additions & 2 deletions
@@ -113,8 +113,12 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
-size_t llama_kv_cache_iswa::memory_use(ggml_backend_buffer_type_t buft) const {
-    return kv_base->memory_use(buft) + kv_swa->memory_use(buft);
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
 }
 
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
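
llama_kv_cache_iswa here and llama_memory_hybrid further down merge two child breakdowns with the same loop: copy one map, then add the other's byte counts per buffer type. A hypothetical generic helper (not part of this commit) that would express the pattern once:

#include <cstddef>
#include <map>

// Hypothetical helper: fold breakdown b into a, summing bytes per buffer type.
template <typename K>
std::map<K, std::size_t> merge_breakdowns(std::map<K, std::size_t> a, const std::map<K, std::size_t> & b) {
    for (const auto & kv : b) {
        a[kv.first] += kv.second;
    }
    return a;
}

With it, the body above would reduce to return merge_breakdowns(kv_base->memory_breakdown(), kv_swa->memory_breakdown());.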

src/llama-kv-cache-iswa.h

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ class llama_kv_cache_iswa : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-kv-cache.cpp

Lines changed: 4 additions & 7 deletions
@@ -473,15 +473,12 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
-size_t llama_kv_cache::memory_use(ggml_backend_buffer_type_t buft) const {
-    size_t n_bytes = 0;
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        if (ggml_backend_buffer_get_type(buf_ptr.get()) != buft) {
-            continue;
-        }
-        n_bytes += ggml_backend_buffer_get_size(buf_ptr.get());
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
     }
-    return n_bytes;
+    return ret;
 }
 
 llama_memory_context_ptr llama_kv_cache::init_batch(

src/llama-kv-cache.h

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ class llama_kv_cache : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-memory-hybrid.cpp

Lines changed: 6 additions & 2 deletions
@@ -166,8 +166,12 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-size_t llama_memory_hybrid::memory_use(ggml_backend_buffer_type_t buft) const {
-    return mem_attn->memory_use(buft) + mem_recr->memory_use(buft);
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
 }
 
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {

src/llama-memory-hybrid.h

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class llama_memory_hybrid : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     // state write/load

src/llama-memory-recurrent.cpp

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,6 @@
 #include "llama-memory-recurrent.h"
 
+#include "ggml-alloc.h"
 #include "llama-impl.h"
 #include "llama-io.h"
 #include "llama-batch.h"
@@ -359,15 +360,12 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
-size_t llama_memory_recurrent::memory_use(ggml_backend_buffer_type_t buft) const {
-    size_t n_bytes = 0;
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        if (ggml_backend_buffer_get_type(buf_ptr.get()) != buft) {
-            continue;
-        }
-        n_bytes += ggml_backend_buffer_get_size(buf_ptr.get());
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
     }
-    return n_bytes;
+    return ret;
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {

src/llama-memory-recurrent.h

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,7 +51,7 @@ class llama_memory_recurrent : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    size_t memory_use(ggml_backend_buffer_type_t buft) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
 
     bool prepare(const std::vector<llama_ubatch> & ubatches);
