@@ -1,6 +1,5 @@
 #include "llama-context.h"
 
-#include "ggml-backend.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -2790,7 +2789,7 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     table_data.reserve(devices.size());
     const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
     const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
-    const std::string template_host   = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
+    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
 
     table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
 
@@ -2835,34 +2834,29 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
         seen_host_buffer_types.insert(buft);
     }
 
-    // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    llama_memory_breakdown_data mb_host = {0, 0, 0};
-
     // consolidate all memory buffers not on any of the models GPU devices as host memory:
     for (const auto & buft_mb : memory_breakdown) {
         ggml_backend_buffer_type_t          buft = buft_mb.first;
         const llama_memory_breakdown_data & mb   = buft_mb.second;
         if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        mb_host.model   += mb.model;
-        mb_host.context += mb.context;
-        mb_host.compute += mb.compute;
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            "  - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_host_buffer_types.insert(buft);
         seen_host_buffer_types.insert(buft);
     }
 
-    const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
-    table_data.push_back({
-        template_host,
-        "  - Host",
-        "", // total
-        "", // free
-        std::to_string(self_host / MiB),
-        std::to_string(mb_host.model / MiB),
-        std::to_string(mb_host.context / MiB),
-        std::to_string(mb_host.compute / MiB),
-        ""}); // unaccounted
-
     for (size_t j = 1; j < table_data[0].size(); j++) {
         size_t max_len = 0;
         for (const auto & td : table_data) {
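
Taken together, the second hunk replaces the single consolidated "  - Host" row with one table row per remaining (non-GPU) buffer type: the row label comes from ggml_backend_buft_name(buft), self is the sum of the model, context, and compute byte counts, and the total/free/unaccounted columns stay empty because they are not well defined for host memory. Below is a minimal standalone sketch of that per-buffer-type row building plus the column-alignment pass the hunk ends on; the struct, the buffer names, and the sizes are invented for illustration, and none of this is the actual llama.cpp code.

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // stand-in for llama.cpp's llama_memory_breakdown_data (byte counts per category)
    struct memory_breakdown_data {
        size_t model;
        size_t context;
        size_t compute;
    };

    constexpr size_t MiB = 1024 * 1024;

    int main() {
        // hypothetical host-side buffer types with made-up sizes, keyed by the
        // name that ggml_backend_buft_name() would supply in the real code:
        const std::map<std::string, memory_breakdown_data> host_breakdown = {
            {"CPU",        {4096 * MiB, 512 * MiB, 256 * MiB}},
            {"CPU_Mapped", {2048 * MiB,         0,         0}},
        };

        // header row plus one row per buffer type; self = model + context + compute
        std::vector<std::vector<std::string>> table_data = {
            {"memory breakdown [MiB]", "self", "model", "context", "compute"},
        };
        for (const auto & [name, mb] : host_breakdown) {
            const size_t self = mb.model + mb.context + mb.compute;
            table_data.push_back({
                "  - " + name,
                std::to_string(self       / MiB),
                std::to_string(mb.model   / MiB),
                std::to_string(mb.context / MiB),
                std::to_string(mb.compute / MiB)});
        }

        // pad each column to its widest entry, in the same spirit as the
        // alignment loop after the hunk: left-align the name column,
        // right-align the numeric columns
        for (size_t j = 0; j < table_data[0].size(); j++) {
            size_t max_len = 0;
            for (const auto & td : table_data) {
                max_len = std::max(max_len, td[j].length());
            }
            for (auto & td : table_data) {
                td[j].insert(j == 0 ? td[j].length() : 0, max_len - td[j].length(), ' ');
            }
        }
        for (const auto & td : table_data) {
            for (const auto & cell : td) {
                printf("%s | ", cell.c_str());
            }
            printf("\n");
        }
        return 0;
    }

Compiled as C++17, this prints a small aligned table in the same shape as the memory-breakdown output, with one line per host buffer type rather than one aggregated "Host" line.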