@@ -2030,20 +2030,11 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-
-    for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        { // memory allocated statically on device of the backend itself
-            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
-        { // memory allocated on host for backend
-            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(ggml_backend_get_device(backend));
-            if (ret.count(buft) != 0) {
-                continue; // multiple backends may use the same host buffer type
-            }
-            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
-        }
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
     }
     for (const auto & backend_ptr : backends) {
         ggml_backend_t backend = backend_ptr.get();
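A minimal, self-contained sketch of the accumulation pattern the new hunk relies on: `std::map::operator[]` value-initializes a missing entry to all zeros, so the two loops can add the per-buffer-type sizes reported by the model and by the memory module directly into the `model` and `context` fields. The `BufferType` alias, `MemoryBreakdown` struct, and hard-coded sizes below are illustrative stand-ins, not the real ggml/llama types:

```cpp
#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

using BufferType = int; // stand-in for ggml_backend_buffer_type_t

struct MemoryBreakdown {   // stand-in for llama_memory_breakdown_data
    size_t model   = 0;    // memory for the model weights
    size_t context = 0;    // memory for the llama_context (e.g. KV cache)
    size_t compute = 0;    // memory for temporary compute buffers
};

int main() {
    // hypothetical (buffer type, size) pairs, standing in for what
    // model.memory_breakdown() and memory->memory_breakdown() report:
    const std::vector<std::pair<BufferType, size_t>> model_sizes  = {{0, 4096}, {1, 512}};
    const std::vector<std::pair<BufferType, size_t>> memory_sizes = {{0, 1024}, {2, 256}};

    std::map<BufferType, MemoryBreakdown> ret;
    for (const auto & buft_size : model_sizes) {
        ret[buft_size.first].model += buft_size.second; // operator[] zero-initializes new entries
    }
    for (const auto & buft_size : memory_sizes) {
        ret[buft_size.first].context += buft_size.second;
    }

    for (const auto & [buft, mb] : ret) {
        printf("buft=%d: model=%zu context=%zu compute=%zu\n", buft, mb.model, mb.context, mb.compute);
    }
    return 0;
}
```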
@@ -2806,74 +2797,70 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     constexpr size_t MiB = 1024 * 1024;
     const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
 
-    // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    llama_memory_breakdown_data mb_host_acc = memory_breakdown[ggml_backend_cpu_buffer_type()];
     // track seen host buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types = {ggml_backend_cpu_buffer_type()};
+    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types;
 
+    // GPU devices have their own memory, print a breakdown for each GPU on a single line:
     for (const ggml_backend_dev_t & dev : devices) {
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            continue;
+        }
         ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
-
         const llama_memory_breakdown_data & mb = memory_breakdown[buft];
 
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            const std::string name = ggml_backend_buft_name(buft);
-            std::string desc = ggml_backend_dev_description(dev);
-            for (const std::string & prefix : desc_prefixes_strip) {
-                if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                    desc = desc.substr(prefix.length());
-                }
+        const std::string name = ggml_backend_buft_name(buft);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
             }
-
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-
-            const size_t self = mb.model + mb.context + mb.compute;
-            const size_t unaccounted = total - self - free;
-
-            table_data.push_back({
-                template_gpu,
-                "  - " + name + " (" + desc + ")",
-                std::to_string(total / MiB),
-                std::to_string(free / MiB),
-                std::to_string(self / MiB),
-                std::to_string(mb.model / MiB),
-                std::to_string(mb.context / MiB),
-                std::to_string(mb.compute / MiB),
-                std::to_string(unaccounted / MiB)});
-        } else {
-            if (seen_host_buffer_types.count(buft) == 1) {
-                continue;
-            }
-            mb_host_acc.model += mb.model;
-            mb_host_acc.context += mb.context;
-            mb_host_acc.compute += mb.compute;
-            seen_host_buffer_types.insert(buft);
         }
 
-        ggml_backend_buffer_type_t buft_host = ggml_backend_dev_host_buffer_type(dev);
-        if (!buft_host) {
-            continue;
-        }
-        if (seen_host_buffer_types.count(buft_host) == 1) {
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            "  - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+        seen_host_buffer_types.insert(buft);
+    }
+
+    // "free" host memory is poorly defined, instead track only memory that we know is being used:
+    llama_memory_breakdown_data mb_host = {0, 0, 0};
+
+    // consolidate all memory buffers not on any of the model's GPU devices as host memory:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        const llama_memory_breakdown_data & mb_host = memory_breakdown[buft_host];
-        mb_host_acc.model += mb_host.model;
-        mb_host_acc.context += mb_host.context;
-        mb_host_acc.compute += mb_host.compute;
-        seen_host_buffer_types.insert(buft_host);
+        mb_host.model += mb.model;
+        mb_host.context += mb.context;
+        mb_host.compute += mb.compute;
+        seen_host_buffer_types.insert(buft);
     }
-    const size_t self_host = mb_host_acc.model + mb_host_acc.context + mb_host_acc.compute;
+
+    const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
     table_data.push_back({
         template_host,
         "  - Host",
         "", // total
         "", // free
         std::to_string(self_host / MiB),
-        std::to_string(mb_host_acc.model / MiB),
-        std::to_string(mb_host_acc.context / MiB),
-        std::to_string(mb_host_acc.compute / MiB),
+        std::to_string(mb_host.model / MiB),
+        std::to_string(mb_host.context / MiB),
+        std::to_string(mb_host.compute / MiB),
         ""}); // unaccounted
 
     for (size_t j = 1; j < table_data[0].size(); j++) {
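A sketch of the consolidation step introduced above, under the same illustrative stand-in types as before: GPU buffer types are recorded in `seen_host_buffer_types` while their per-GPU rows are printed, and whatever buffer types remain in the breakdown map are summed into a single host row, since "free" host memory is not well defined. Buffer type 0 and the concrete sizes below are made up for illustration:

```cpp
#include <cstddef>
#include <cstdio>
#include <map>
#include <set>

using BufferType = int; // stand-in for ggml_backend_buffer_type_t

struct MemoryBreakdown { // stand-in for llama_memory_breakdown_data
    size_t model = 0, context = 0, compute = 0;
};

int main() {
    // hypothetical per-buffer-type breakdown: 0 is a GPU buffer type, 1 and 2 are host-side:
    std::map<BufferType, MemoryBreakdown> memory_breakdown = {
        {0, {4096, 1024, 128}},
        {1, { 512,    0,  64}},
        {2, {   0,  256,   0}},
    };
    // buffer types already reported on per-GPU lines:
    std::set<BufferType> seen_host_buffer_types = {0};

    // "free" host memory is poorly defined, so only memory known to be used is accumulated:
    MemoryBreakdown mb_host = {0, 0, 0};
    for (const auto & [buft, mb] : memory_breakdown) {
        if (seen_host_buffer_types.count(buft) == 1) {
            continue; // already accounted for on a GPU line
        }
        mb_host.model   += mb.model;
        mb_host.context += mb.context;
        mb_host.compute += mb.compute;
        seen_host_buffer_types.insert(buft);
    }

    const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
    printf("Host: self=%zu model=%zu context=%zu compute=%zu\n",
           self_host, mb_host.model, mb_host.context, mb_host.compute);
    return 0;
}
```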