1 change: 1 addition & 0 deletions common/sampling.cpp
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
if (ctx) {
llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
}
}

3 changes: 2 additions & 1 deletion ggml/include/ggml-backend.h
@@ -314,7 +314,8 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
8 changes: 8 additions & 0 deletions ggml/src/ggml-backend.cpp
@@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
return sched->backends[i];
}

ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

return sched->bufts[backend_index];
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
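Illustrative only, not part of this change: a sketch of how the new getter pairs with the existing size query when walking the scheduler's backends. It assumes the existing helpers ggml_backend_sched_get_n_backends(), ggml_backend_sched_get_backend(), ggml_backend_name() and ggml_backend_buft_name(); the helper name below is hypothetical.

#include <cstdio>
#include "ggml-backend.h"

// illustrative: report the compute buffer type and size the scheduler holds per backend
static void print_sched_compute_buffers(ggml_backend_sched_t sched) {
    const int n_backends = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n_backends; i++) {
        ggml_backend_t             backend = ggml_backend_sched_get_backend(sched, i);
        ggml_backend_buffer_type_t buft    = ggml_backend_sched_get_buffer_type(sched, backend); // new in this change
        const size_t               size    = ggml_backend_sched_get_buffer_size(sched, backend);
        printf("%s: compute buffer type = %s, size = %zu bytes\n",
               ggml_backend_name(backend), ggml_backend_buft_name(buft), size);
    }
}
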
26 changes: 15 additions & 11 deletions include/llama.h
@@ -1329,24 +1329,25 @@ extern "C" {
//
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
// NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
//

struct llama_perf_context_data {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;

int32_t n_p_eval;
int32_t n_eval;
int32_t n_reused; // number of times a ggml compute graph had been reused
// ms == milliseconds
double t_start_ms; // absolute start time
double t_load_ms; // time needed for loading the model
double t_p_eval_ms; // time needed for processing the prompt
double t_eval_ms; // time needed for generating tokens

int32_t n_p_eval; // number of prompt tokens
int32_t n_eval; // number of generated tokens
int32_t n_reused; // number of times a ggml compute graph had been reused
};

struct llama_perf_sampler_data {
double t_sample_ms;
double t_sample_ms; // time needed for sampling in ms

int32_t n_sample;
int32_t n_sample; // number of sampled tokens
};

LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@@ -1358,6 +1359,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);

// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);

//
// training
//
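A minimal usage sketch of the new public entry point (the helper name is hypothetical, and the header above still advises third-party apps to do their own measurements):

#include "llama.h"

// hypothetical helper: dump timings plus the new per-device memory table
// for an already-created context
static void print_run_stats(const struct llama_context * ctx) {
    llama_perf_context_print(ctx);      // existing timing summary
    llama_memory_breakdown_print(ctx);  // new: per-device memory breakdown via LLAMA_LOG
}
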
151 changes: 151 additions & 0 deletions src/llama-context.cpp
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
n_reused = 0;
}

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
for (const auto & buft_size : model.memory_breakdown()) {
ret[buft_size.first].model += buft_size.second;
}
for (const auto & buft_size : memory->memory_breakdown()) {
ret[buft_size.first].context += buft_size.second;
}
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
return ret;
}

//
// training
//
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}

void llama_memory_breakdown_print(const struct llama_context * ctx) {
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";

table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});

constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};

// track seen buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_buffer_types;

// accumulative memory breakdown for each device and for host:
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
llama_memory_breakdown_data mb_host;

for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (ggml_backend_buft_is_host(buft)) {
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
i_dev = i;
break;
}
}
if (i_dev != -1) {
mb_dev[i_dev].model += mb.model;
mb_dev[i_dev].context += mb.context;
mb_dev[i_dev].compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
}
}

// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
llama_memory_breakdown_data mb = mb_dev[i];

const std::string name = ggml_backend_dev_name(dev);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}

size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);

const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;

table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
}

// print memory breakdown for host:
{
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_other,
" - Host",
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted
}

// print memory breakdown for all remaining buffer types:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_buffer_types.count(buft) == 1) {
continue;
}
const std::string name = ggml_backend_buft_name(buft);
const size_t self = mb.model + mb.context + mb.compute;
table_data.push_back({
template_other,
" - " + name,
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
""}); // unaccounted
seen_buffer_types.insert(buft);
}

for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LLAMA_LOG_INFO(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}

//
// training
//
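With the templates above, the logged table comes out roughly like the sketch below (hypothetical device, made-up but arithmetically consistent numbers, shown only to illustrate the column relationships): each GPU row satisfies total = free + (self = model + context + compute) + unaccounted, while the Host row and any remaining buffer types report only self = model + context + compute.

llama_memory_breakdown_print: | memory breakdown [MiB] | total    free     self   model  context  compute  unaccounted |
llama_memory_breakdown_print: |   - CUDA0 (RTX 4090)   | 24080 = 10123 + (13456 = 12000 +    256 +    1200) +      501 |
llama_memory_breakdown_print: |   - Host               |                    1134 =  1000 +      6 +     128             |
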
10 changes: 10 additions & 0 deletions src/llama-context.h
@@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i;
class llama_io_write_i;

// "memory" as in abstract memory for the context
struct llama_memory_i;
struct llama_memory_context_i;

// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
size_t model = 0; // memory allocated for the model
size_t context = 0; // memory allocated for the context
size_t compute = 0; // memory allocated for temporary compute buffers
};

struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs
llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
llama_perf_context_data perf_get_data() const;
void perf_reset();

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;

//
// training
//
8 changes: 8 additions & 0 deletions src/llama-kv-cache-iswa.cpp
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
for (const auto & buft_size : kv_swa->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}

llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all);

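The merge above relies on std::map::operator[] value-initializing missing size_t entries to 0, so the SWA sizes accumulate onto the base sizes per buffer type. A self-contained sketch of the same idiom, with string keys standing in for buffer types:

#include <cstdio>
#include <map>
#include <string>

int main() {
    std::map<std::string, size_t> base = {{"CPU", 64}, {"CUDA0", 256}}; // stand-in for kv_base sizes
    std::map<std::string, size_t> swa  = {{"CUDA0", 128}};              // stand-in for kv_swa sizes
    for (const auto & buft_size : swa) {
        base[buft_size.first] += buft_size.second; // missing keys start at 0
    }
    for (const auto & kv : base) {
        printf("%s: %zu\n", kv.first.c_str(), kv.second); // prints CPU: 64, then CUDA0: 384
    }
    return 0;
}
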
2 changes: 2 additions & 0 deletions src/llama-kv-cache-iswa.h
@@ -56,6 +56,8 @@ class llama_kv_cache_iswa : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
8 changes: 8 additions & 0 deletions src/llama-kv-cache.cpp
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
return cells.seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

llama_memory_context_ptr llama_kv_cache::init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
2 changes: 2 additions & 0 deletions src/llama-kv-cache.h
@@ -121,6 +121,8 @@ class llama_kv_cache : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
8 changes: 8 additions & 0 deletions src/llama-memory-hybrid.cpp
@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}

void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);

2 changes: 2 additions & 0 deletions src/llama-memory-hybrid.h
@@ -68,6 +68,8 @@ class llama_memory_hybrid : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
8 changes: 8 additions & 0 deletions src/llama-memory-recurrent.cpp
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result;
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
3 changes: 3 additions & 0 deletions src/llama-memory-recurrent.h
@@ -4,6 +4,7 @@
#include "llama-graph.h"
#include "llama-memory.h"

#include <map>
#include <set>
#include <vector>

@@ -50,6 +51,8 @@ class llama_memory_recurrent : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

bool prepare(const std::vector<llama_ubatch> & ubatches);

// find a contiguous slot of memory cells and emplace the ubatch there
3 changes: 3 additions & 0 deletions src/llama-memory.h
@@ -2,6 +2,7 @@

#include "llama.h"

#include <map>
#include <memory>
#include <functional>

@@ -108,6 +109,8 @@ struct llama_memory_i {
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;

//
// state write/read
//
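For a custom llama_memory_i implementation, the new pure-virtual override can follow the same pattern as llama_kv_cache above: sum the size of every buffer the object owns, keyed by buffer type. A sketch, assuming a hypothetical subclass "my_memory" that keeps its buffers in a bufs vector of ggml_backend_buffer_ptr:

// sketch of a memory_breakdown() override for a hypothetical llama_memory_i subclass
std::map<ggml_backend_buffer_type_t, size_t> my_memory::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
    }
    return ret;
}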