diff --git a/common/sampling.cpp b/common/sampling.cpp index c710ee173c0ed..c69d525b5b358 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam } if (ctx) { llama_perf_context_print(ctx); + llama_memory_breakdown_print(ctx); } } diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index ab297e0c6f69f..62b6d65e51445 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -314,7 +314,8 @@ extern "C" { GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); - GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 7646f3f1346a4..f32a879d9a4b6 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) return sched->backends[i]; } +ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) { + GGML_ASSERT(sched); + int backend_index = ggml_backend_sched_backend_id(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + + return sched->bufts[backend_index]; +} + size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { GGML_ASSERT(sched); int backend_index = ggml_backend_sched_backend_id(sched, backend); diff --git a/include/llama.h b/include/llama.h index 453190e852b51..452d9ec5bf285 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1329,24 +1329,25 @@ extern "C" { // // Performance utils // - // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. + // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements. // struct llama_perf_context_data { - double t_start_ms; - double t_load_ms; - double t_p_eval_ms; - double t_eval_ms; - - int32_t n_p_eval; - int32_t n_eval; - int32_t n_reused; // number of times a ggml compute graph had been reused + // ms == milliseconds + double t_start_ms; // absolute start time + double t_load_ms; // time needed for loading the model + double t_p_eval_ms; // time needed for processing the prompt + double t_eval_ms; // time needed for generating tokens + + int32_t n_p_eval; // number of prompt tokens + int32_t n_eval; // number of generated tokens + int32_t n_reused; // number of times a ggml compute graph had been reused }; struct llama_perf_sampler_data { - double t_sample_ms; + double t_sample_ms; // time needed for sampling in ms - int32_t n_sample; + int32_t n_sample; // number of sampled tokens }; LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); @@ -1358,6 +1359,9 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); + // print a breakdown of per-device memory use via LLAMA_LOG: + LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); + // // training // diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 289a32b6d3473..49386e85665f9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2027,6 +2027,21 @@ void llama_context::perf_reset() { n_reused = 0; } +std::map llama_context::memory_breakdown() const { + std::map ret; + for (const auto & buft_size : model.memory_breakdown()) { + ret[buft_size.first].model += buft_size.second; + } + for (const auto & buft_size : memory->memory_breakdown()) { + ret[buft_size.first].context += buft_size.second; + } + for (const auto & backend_ptr : backends) { + ggml_backend_t backend = backend_ptr.get(); + ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + } + return ret; +} + // // training // @@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } +void llama_memory_breakdown_print(const struct llama_context * ctx) { + const std::vector & devices = ctx->get_model().devices; + + std::map memory_breakdown = ctx->memory_breakdown(); + + std::vector> table_data; + table_data.reserve(devices.size()); + const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n"; + const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n"; + const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n"; + + table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"}); + + constexpr size_t MiB = 1024 * 1024; + const std::vector desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "}; + + // track seen buffer types to avoid double counting: + std::set seen_buffer_types; + + // accumulative memory breakdown for each device and for host: + std::vector mb_dev(devices.size()); + llama_memory_breakdown_data mb_host; + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + if (ggml_backend_buft_is_host(buft)) { + mb_host.model += mb.model; + mb_host.context += mb.context; + mb_host.compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (dev) { + int i_dev = -1; + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == dev) { + i_dev = i; + break; + } + } + if (i_dev != -1) { + mb_dev[i_dev].model += mb.model; + mb_dev[i_dev].context += mb.context; + mb_dev[i_dev].compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + } + } + + // print memory breakdown for each device: + for (size_t i = 0; i < devices.size(); i++) { + ggml_backend_dev_t dev = devices[i]; + llama_memory_breakdown_data mb = mb_dev[i]; + + const std::string name = ggml_backend_dev_name(dev); + std::string desc = ggml_backend_dev_description(dev); + for (const std::string & prefix : desc_prefixes_strip) { + if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) { + desc = desc.substr(prefix.length()); + } + } + + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + + const size_t self = mb.model + mb.context + mb.compute; + const size_t unaccounted = total - self - free; + + table_data.push_back({ + template_gpu, + " - " + name + " (" + desc + ")", + std::to_string(total / MiB), + std::to_string(free / MiB), + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + std::to_string(unaccounted / MiB)}); + } + + // print memory breakdown for host: + { + const size_t self = mb_host.model + mb_host.context + mb_host.compute; + table_data.push_back({ + template_other, + " - Host", + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb_host.model / MiB), + std::to_string(mb_host.context / MiB), + std::to_string(mb_host.compute / MiB), + ""}); // unaccounted + } + + // print memory breakdown for all remaining buffer types: + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + if (seen_buffer_types.count(buft) == 1) { + continue; + } + const std::string name = ggml_backend_buft_name(buft); + const size_t self = mb.model + mb.context + mb.compute; + table_data.push_back({ + template_other, + " - " + name, + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + ""}); // unaccounted + seen_buffer_types.insert(buft); + } + + for (size_t j = 1; j < table_data[0].size(); j++) { + size_t max_len = 0; + for (const auto & td : table_data) { + max_len = std::max(max_len, td[j].length()); + } + for (auto & td : table_data) { + td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' '); + } + } + for (const auto & td : table_data) { + LLAMA_LOG_INFO(td[0].c_str(), + __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(), + td[6].c_str(), td[7].c_str(), td[8].c_str()); + } +} + // // training // diff --git a/src/llama-context.h b/src/llama-context.h index f23aa8ee1368d..ed6d82cb396f9 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -17,9 +17,17 @@ class llama_batch_allocr; class llama_io_read_i; class llama_io_write_i; +// "memory" as in abstract memory for the context struct llama_memory_i; struct llama_memory_context_i; +// "memory" as in physical memory for a buffer type, in bytes +struct llama_memory_breakdown_data { + size_t model = 0; // memory allocated for the model + size_t context = 0; // memory allocated for the context + size_t compute = 0; // memory allocated for temporary compute buffers +}; + struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs llama_context( @@ -144,6 +152,8 @@ struct llama_context { llama_perf_context_data perf_get_data() const; void perf_reset(); + std::map memory_breakdown() const; + // // training // diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index d7342914c6b7c..827302e6d25bd 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } +std::map llama_kv_cache_iswa::memory_breakdown() const { + std::map mb = kv_base->memory_breakdown(); + for (const auto & buft_size : kv_swa->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { GGML_UNUSED(embd_all); diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 5ed134b795800..70ab22f0d6086 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -56,6 +56,8 @@ class llama_kv_cache_iswa : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 885be072a75c8..816f2d5de592b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { return cells.seq_pos_max(seq_id); } +std::map llama_kv_cache::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + llama_memory_context_ptr llama_kv_cache::init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 30de013f5f7f3..85f0663d8c1d4 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -121,6 +121,8 @@ class llama_kv_cache : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index ba61ebaa885fe..abf652483c202 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const { return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id)); } +std::map llama_memory_hybrid::memory_breakdown() const { + std::map mb = mem_attn->memory_breakdown(); + for (const auto & buft_size : mem_recr->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { GGML_UNUSED(flags); diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index 11a3565178297..558cafdf984c9 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -68,6 +68,8 @@ class llama_memory_hybrid : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 08716ed91aed1..44645fcdd2d48 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { return result; } +std::map llama_memory_recurrent::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { do { balloc.split_reset(); diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index c4daf00495bc2..077c6e3ce938d 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -4,6 +4,7 @@ #include "llama-graph.h" #include "llama-memory.h" +#include #include #include @@ -50,6 +51,8 @@ class llama_memory_recurrent : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + bool prepare(const std::vector & ubatches); // find a contiguous slot of memory cells and emplace the ubatch there diff --git a/src/llama-memory.h b/src/llama-memory.h index ccd1f073b0848..4a157b91fdbdc 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -2,6 +2,7 @@ #include "llama.h" +#include #include #include @@ -108,6 +109,8 @@ struct llama_memory_i { virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; + virtual std::map memory_breakdown() const = 0; + // // state write/read // diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 818b209641a5a..1e84c54fa3c8d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5903,6 +5903,14 @@ size_t llama_model::n_devices() const { return devices.size(); } +std::map llama_model::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + uint64_t llama_model::n_elements() const { return pimpl->n_elements; } diff --git a/src/llama-model.h b/src/llama-model.h index 10b1767f27228..80aff5f2f2d1a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -7,6 +7,7 @@ #include "llama-memory.h" #include "llama-vocab.h" +#include #include #include #include @@ -449,10 +450,12 @@ struct llama_model { std::string desc() const; - size_t size() const; + size_t size() const; // file size size_t n_tensors() const; size_t n_devices() const; + std::map memory_breakdown() const; + // total number of parameters in the model uint64_t n_elements() const; diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 80cbb095da4cb..e1b1a5fe0dbd7 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -2060,6 +2060,7 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); + llama_memory_breakdown_print(ctx); llama_backend_free();