Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
if (ctx) {
llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
}
}

Expand Down
3 changes: 2 additions & 1 deletion ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
return sched->backends[i];
}

// Look up the buffer type that the scheduler holds for `backend`.
//
// GGML_ASSERTs that `sched` is non-null and that the id reported by
// ggml_backend_sched_backend_id() for `backend` lies in [0, sched->n_backends).
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);

    const int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    // bufts is indexed by the same id as the scheduler's backends
    ggml_backend_buffer_type_t buft = sched->bufts[backend_index];
    return buft;
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
Expand Down
26 changes: 15 additions & 11 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -1329,24 +1329,25 @@ extern "C" {
//
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
// NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
//

// Performance counters of a context.
// Each member is declared exactly once; all durations are in milliseconds.
struct llama_perf_context_data {
    // ms == milliseconds
    double t_start_ms;  // absolute start time
    double t_load_ms;   // time needed for loading the model
    double t_p_eval_ms; // time needed for processing the prompt
    double t_eval_ms;   // time needed for generating tokens

    int32_t n_p_eval;   // number of prompt tokens
    int32_t n_eval;     // number of generated tokens
    int32_t n_reused;   // number of times a ggml compute graph had been reused
};

// Performance counters of a sampler.
// Each member is declared exactly once.
struct llama_perf_sampler_data {
    double t_sample_ms; // time needed for sampling in ms

    int32_t n_sample;   // number of sampled tokens
};

LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
Expand All @@ -1358,6 +1359,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);

// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);

//
// training
//
Expand Down
114 changes: 114 additions & 0 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "llama-context.h"

#include "ggml-backend.h"
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-io.h"
Expand Down Expand Up @@ -2027,6 +2028,21 @@ void llama_context::perf_reset() {
n_reused = 0;
}

// Collect the memory used by this context, grouped by backend buffer type.
//
// For every buffer type the result holds three sums:
//   - model:   sizes reported by model.memory_breakdown()
//   - context: sizes reported by memory->memory_breakdown() (skipped when no memory module is set)
//   - compute: scheduler buffer size of every backend that uses this buffer type
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;

    for (const auto & buft_size : model.memory_breakdown()) {
        ret[buft_size.first].model += buft_size.second;
    }

    // NOTE(review): memory is presumably unset for contexts created without a memory module
    //               - guard the dereference instead of crashing while printing statistics
    if (memory) {
        for (const auto & buft_size : memory->memory_breakdown()) {
            ret[buft_size.first].context += buft_size.second;
        }
    }

    for (const auto & backend_ptr : backends) {
        ggml_backend_t backend = backend_ptr.get();

        const size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += size;
    }

    return ret;
}

//
// training
//
Expand Down Expand Up @@ -2765,6 +2781,104 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}

void llama_memory_breakdown_print(const struct llama_context * ctx) {
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_host = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";

table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});

constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};

// track seen host buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_host_buffer_types;

// GPU devices have their own memory, print a breakdown for each GPU on a single line:
for (const ggml_backend_dev_t & dev : devices) {
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
continue;
}
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
const llama_memory_breakdown_data & mb = memory_breakdown[buft];

const std::string name = ggml_backend_buft_name(buft);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}

size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);

const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;

table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
seen_host_buffer_types.insert(buft);
}

// "free" host memory is poorly defined, instead track only memory that we know is being used:
llama_memory_breakdown_data mb_host = {0, 0, 0};

// consolidate all memory buffers not on any of the models GPU devices as host memory:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_host_buffer_types.count(buft) == 1) {
continue;
}
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_host_buffer_types.insert(buft);
}

const size_t self_host = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_host,
" - Host",
"", // total
"", // free
std::to_string(self_host / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted

for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LLAMA_LOG_INFO(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}

//
// training
//
Expand Down
10 changes: 10 additions & 0 deletions src/llama-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i;
class llama_io_write_i;

// "memory" as in abstract memory for the context
struct llama_memory_i;
struct llama_memory_context_i;

// "memory" as in physical memory for a buffer type, in bytes
// Byte counts for a single buffer type, split by purpose; every counter starts at zero.
struct llama_memory_breakdown_data {
    size_t model   = 0; // bytes allocated for the model
    size_t context = 0; // bytes allocated for the context
    size_t compute = 0; // bytes allocated for temporary compute buffers
};

struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs
llama_context(
Expand Down Expand Up @@ -144,6 +152,8 @@ struct llama_context {
llama_perf_context_data perf_get_data() const;
void perf_reset();

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;

//
// training
//
Expand Down
8 changes: 8 additions & 0 deletions src/llama-kv-cache-iswa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
for (const auto & buft_size : kv_swa->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}

llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all);

Expand Down
2 changes: 2 additions & 0 deletions src/llama-kv-cache-iswa.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class llama_kv_cache_iswa : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
Expand Down
8 changes: 8 additions & 0 deletions src/llama-kv-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
return cells.seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

llama_memory_context_ptr llama_kv_cache::init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
Expand Down
2 changes: 2 additions & 0 deletions src/llama-kv-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ class llama_kv_cache : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
Expand Down
8 changes: 8 additions & 0 deletions src/llama-memory-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}

void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);

Expand Down
2 changes: 2 additions & 0 deletions src/llama-memory-hybrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class llama_memory_hybrid : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
Expand Down
9 changes: 9 additions & 0 deletions src/llama-memory-recurrent.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "llama-memory-recurrent.h"

#include "ggml-alloc.h"
#include "llama-impl.h"
#include "llama-io.h"
#include "llama-batch.h"
Expand Down Expand Up @@ -359,6 +360,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result;
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
Expand Down
3 changes: 3 additions & 0 deletions src/llama-memory-recurrent.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "llama-graph.h"
#include "llama-memory.h"

#include <map>
#include <set>
#include <vector>

Expand Down Expand Up @@ -50,6 +51,8 @@ class llama_memory_recurrent : public llama_memory_i {
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

bool prepare(const std::vector<llama_ubatch> & ubatches);

// find a contiguous slot of memory cells and emplace the ubatch there
Expand Down
3 changes: 3 additions & 0 deletions src/llama-memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "llama.h"

#include <map>
#include <memory>
#include <functional>

Expand Down Expand Up @@ -108,6 +109,8 @@ struct llama_memory_i {
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;

//
// state write/read
//
Expand Down
8 changes: 8 additions & 0 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5903,6 +5903,14 @@ size_t llama_model::n_devices() const {
return devices.size();
}

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

uint64_t llama_model::n_elements() const {
return pimpl->n_elements;
}
Expand Down
Loading
Loading