Commit e789095

llama: print memory breakdown on exit (ggml-org#15860)
* llama: print memory breakdown on exit
1 parent f2a789e, commit e789095

18 files changed: +244 -13 lines

common/sampling.cpp

Lines changed: 1 addition & 0 deletions
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
     if (ctx) {
         llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
     }
 }

ggml/include/ggml-backend.h

Lines changed: 2 additions & 1 deletion
@@ -314,7 +314,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
-    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
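
Note: the new getter returns the buffer type the scheduler uses for a given backend, complementing the existing size query. Below is a minimal sketch (not part of this commit) of combining the two to sum compute-buffer usage per buffer type, using the scheduler introspection calls that already exist in ggml-backend.h; it mirrors what llama_context::memory_breakdown() does further down in this commit.

// Sketch only: accumulate the scheduler's compute buffer sizes per buffer type.
#include <cstdio>
#include <map>

#include "ggml-backend.h"

static void print_sched_compute_buffers(ggml_backend_sched_t sched) {
    std::map<ggml_backend_buffer_type_t, size_t> compute_per_buft;
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        compute_per_buft[ggml_backend_sched_get_buffer_type(sched, backend)] +=
            ggml_backend_sched_get_buffer_size(sched, backend);
    }
    for (const auto & it : compute_per_buft) {
        // report in MiB per buffer type
        printf("%s: %zu MiB\n", ggml_backend_buft_name(it.first), it.second / (1024*1024));
    }
}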

ggml/src/ggml-backend.cpp

Lines changed: 8 additions & 0 deletions
@@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
     return sched->backends[i];
 }
 
+ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     GGML_ASSERT(sched);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);

include/llama.h

Lines changed: 15 additions & 11 deletions
@@ -1329,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval; // number of prompt tokens
+        int32_t n_eval;   // number of generated tokens
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms
 
-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
     };
 
     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);

@@ -1358,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
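
For reference, a minimal sketch of calling the new entry point directly (not from the commit; the model path is a placeholder and error handling is omitted). As the NOTE above says, these performance utilities target llama.cpp's own examples/tools; tools that call common_perf_print() get the printout automatically on exit via the sampling.cpp change above.

#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    llama_model   * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    llama_context * ctx   = llama_init_from_model(model, cparams);

    // ... evaluate prompts / generate tokens ...

    llama_memory_breakdown_print(ctx); // logs per-device model/context/compute memory via LLAMA_LOG

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}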

src/llama-context.cpp

Lines changed: 151 additions & 0 deletions
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //

@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
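
The per-device rows follow the identity encoded in template_gpu: self = model + context + compute, and unaccounted = total - self - free, where total and free come from ggml_backend_dev_memory(). A tiny standalone sketch with made-up numbers (purely illustrative, not real measurements) to show the arithmetic:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t MiB = 1024 * 1024;
    // hypothetical device report from ggml_backend_dev_memory():
    const size_t total = 24576 * MiB, free = 10000 * MiB;
    // hypothetical per-device breakdown:
    const size_t model = 13000 * MiB, context = 1024 * MiB, compute = 400 * MiB;

    const size_t self        = model + context + compute; // 14424 MiB
    const size_t unaccounted = total - self - free;       //   152 MiB: memory not tracked here (or used by other processes)
    printf("self = %zu MiB, unaccounted = %zu MiB\n", self / MiB, unaccounted / MiB);
    return 0;
}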

src/llama-context.h

Lines changed: 10 additions & 0 deletions
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(

@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
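
A sketch of consuming the new internal accessor (this header lives in src/ and is not part of the public llama.h API); summing the three categories over all buffer types yields the "self" figure the printer reports:

// Sketch only, assuming internal access to llama_context (e.g., code inside src/).
#include <cstddef>
#include <map>

#include "llama-context.h"

static size_t total_self_bytes(const llama_context & lctx) {
    size_t total = 0;
    for (const auto & it : lctx.memory_breakdown()) {
        const llama_memory_breakdown_data & mb = it.second;
        total += mb.model + mb.context + mb.compute; // bytes attributed to this context per buffer type
    }
    return total;
}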

src/llama-kv-cache-iswa.cpp

Lines changed: 8 additions & 0 deletions
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);

src/llama-kv-cache-iswa.h

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,8 @@ class llama_kv_cache_iswa : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;

src/llama-kv-cache.cpp

Lines changed: 8 additions & 0 deletions
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,

src/llama-kv-cache.h

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,8 @@ class llama_kv_cache : public llama_memory_i {
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
