 #include "llama-context.h"
 
+#include "ggml-backend.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -374,8 +375,9 @@ llama_context::llama_context(
         }
 
         for (size_t i = 0; i < backend_ptrs.size(); ++i) {
-            ggml_backend_buffer_type_t buft = backend_buft[i];
-            size_t size = ggml_backend_sched_get_buffer_size(sched.get(), buft);
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
             if (size > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                         ggml_backend_buft_name(buft),
@@ -2029,26 +2031,23 @@ void llama_context::perf_reset() {
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
 
-    auto get_memory_breakdown = [&](ggml_backend_buffer_type_t buft) {
-        llama_memory_breakdown_data data;
-        data.model   = model.memory_use(buft);
-        data.context = memory->memory_use(buft);
-        data.compute = ggml_backend_sched_get_buffer_size(sched.get(), buft);
-        return data;
-    };
-
     for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
-
-        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-        ret[buft] = get_memory_breakdown(buft);
-
-        ggml_backend_buffer_type_t buft_host = ggml_backend_dev_host_buffer_type(dev);
-        if (!buft_host) {
-            continue;
+        ggml_backend_t backend = backend_ptr.get();
+        { // memory allocated statically on device of the backend itself
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
+        }
+        { // memory allocated on host for backend
+            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(ggml_backend_get_device(backend));
+            if (ret.count(buft) != 0) {
+                continue; // multiple backends may use the same host buffer type
+            }
+            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
         }
-        ret[buft_host] = get_memory_breakdown(buft_host);
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
     }
     return ret;
 }
@@ -2808,10 +2807,9 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
 
     // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    size_t model_host   = 0;
-    size_t context_host = 0;
-    size_t compute_host = 0;
-    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types; // track seen host buffer types to avoid double counting
+    llama_memory_breakdown_data mb_host_acc = memory_breakdown[ggml_backend_cpu_buffer_type()];
+    // track seen host buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types = {ggml_backend_cpu_buffer_type()};
 
     for (const ggml_backend_dev_t & dev : devices) {
         ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
@@ -2847,9 +2845,9 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
         if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        model_host   += mb.model;
-        context_host += mb.context;
-        compute_host += mb.compute;
+        mb_host_acc.model   += mb.model;
+        mb_host_acc.context += mb.context;
+        mb_host_acc.compute += mb.compute;
         seen_host_buffer_types.insert(buft);
     }
 
@@ -2861,21 +2859,21 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
             continue;
         }
         const llama_memory_breakdown_data & mb_host = memory_breakdown[buft_host];
-        model_host   += mb_host.model;
-        context_host += mb_host.context;
-        compute_host += mb_host.compute;
+        mb_host_acc.model   += mb_host.model;
+        mb_host_acc.context += mb_host.context;
+        mb_host_acc.compute += mb_host.compute;
         seen_host_buffer_types.insert(buft_host);
     }
-    const size_t self_host = model_host + context_host + compute_host;
+    const size_t self_host = mb_host_acc.model + mb_host_acc.context + mb_host_acc.compute;
     table_data.push_back({
         template_host,
         "  - Host",
         "", // total
         "", // free
         std::to_string(self_host / MiB),
-        std::to_string(model_host / MiB),
-        std::to_string(context_host / MiB),
-        std::to_string(compute_host / MiB),
+        std::to_string(mb_host_acc.model / MiB),
+        std::to_string(mb_host_acc.context / MiB),
+        std::to_string(mb_host_acc.compute / MiB),
         ""}); // unaccounted
 
     for (size_t j = 1; j < table_data[0].size(); j++) {