Commit 00fb12b

llama: automatically fit args to free memory
1 parent fe6a988 commit 00fb12b

20 files changed: +776 -51 lines changed

common/arg.cpp

Lines changed: 32 additions & 1 deletion

@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>
 
 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <filesystem>
@@ -1434,7 +1435,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
@@ -2961,6 +2964,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitm", "--fit-margin" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_margin/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_margin = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_MARGIN"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),

common/common.cpp

Lines changed: 13 additions & 2 deletions

@@ -916,6 +916,19 @@ std::string fs_get_cache_file(const std::string & filename) {
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        const bool fit_successful = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
+            params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+
+        if (fit_successful) {
+            LOG_INF("%s: successfully fit parameters to device memory\n", __func__);
+        } else {
+            LOG_WRN("%s: failed to fit parameters to device memory, may crash during allocation\n", __func__);
+        }
+    }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
@@ -926,8 +939,6 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    auto cparams = common_context_params_to_llama(params);
-
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
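
For callers that use the C API directly rather than common_init_from_params, the same pattern can be reproduced by hand. The following is a minimal sketch, not part of the commit: the model path is a placeholder, the 1 GiB margin and 4096-token minimum mirror the common_params defaults, and error handling is reduced to a warning.

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        const char * path_model = "model.gguf"; // placeholder

        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // writable buffers that llama_params_fit may fill in; the params are pointed
        // at the same buffers, mirroring what common_init_from_params does
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> buft_overrides(
            llama_max_tensor_buft_overrides(), {nullptr, nullptr});
        mparams.tensor_split          = tensor_split.data();
        mparams.tensor_buft_overrides = buft_overrides.data();

        const bool ok = llama_params_fit(path_model, &mparams, &cparams,
            tensor_split.data(), buft_overrides.data(),
            /*margin    =*/ 1024u*1024u*1024u, // leave ~1 GiB free per device
            /*n_ctx_min =*/ 4096,
            /*log_level =*/ GGML_LOG_LEVEL_ERROR);
        if (!ok) {
            fprintf(stderr, "failed to fit parameters, allocation may still fail\n");
        }

        llama_model   * model = llama_model_load_from_file(path_model, mparams);
        llama_context * ctx   = model ? llama_init_from_model(model, cparams) : nullptr;

        // ... run inference ...

        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }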

common/common.h

Lines changed: 8 additions & 5 deletions

@@ -274,8 +274,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 4096; // context size
+    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -296,9 +296,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_margin = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
ggml/include/ggml-alloc.h

Lines changed: 9 additions & 0 deletions

@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
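
The size-only variant makes it possible to check whether a context's tensors would fit on a device before touching its memory. A sketch of that pattern follows; it is not from the commit, and the function name alloc_if_it_fits and the margin parameter are made up for illustration:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // ctx must have been created with no_alloc = true, as required by
    // ggml_backend_alloc_ctx_tensors_from_buft
    static ggml_backend_buffer_t alloc_if_it_fits(struct ggml_context * ctx, ggml_backend_dev_t dev, size_t margin) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);

        // how much ggml_backend_alloc_ctx_tensors_from_buft would allocate
        const size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);

        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total);

        if (needed + margin > free) {
            return nullptr; // caller can fall back to a smaller configuration
        }
        return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    }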

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions

@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);

ggml/include/ggml.h

Lines changed: 2 additions & 1 deletion

@@ -2509,7 +2509,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
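
Pairing the new getter with the existing setter allows temporarily replacing the logger and restoring it afterwards, which is what routing low-level messages to a different log level requires. A small sketch, not from the commit; quiet_logger and run_quietly are illustrative names:

    #include "ggml.h"

    static void quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) text; (void) user_data; // drop all output
    }

    static void run_quietly(void (*work)(void)) {
        ggml_log_callback prev_cb = nullptr;
        void *            prev_ud = nullptr;
        ggml_log_get(&prev_cb, &prev_ud); // remember the current callback

        ggml_log_set(quiet_logger, nullptr);
        work();                           // whatever would otherwise be noisy
        ggml_log_set(prev_cb, prev_ud);   // restore the previous callback
    }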

ggml/src/ggml-alloc.c

Lines changed: 57 additions & 11 deletions

@@ -595,7 +595,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }
 
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }
 
 // free the extra space at the end if the new tensor is smaller
@@ -813,7 +815,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -915,21 +918,41 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         if (realloc) {
 #ifndef NDEBUG
             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
-            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
+                __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
             ggml_vbuffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-            if (galloc->buffers[i] == NULL) {
-                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+            if (no_alloc) {
+                galloc->buffers[i] = NULL;
+            } else {
+                galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                if (galloc->buffers[i] == NULL) {
+                    GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                    return false;
+                }
             }
         }
     }
 
     return true;
 }
 
+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1132,14 +1155,16 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }
 
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
 
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
 
     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;
 
     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1151,10 +1176,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
            cur_buf_size += this_size;
@@ -1163,15 +1189,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }
 
+    if (no_alloc) {
+        return NULL;
+    }
+
    if (n_buffers == 0) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
        return NULL;
    }
 
@@ -1181,10 +1213,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
    return buffer;
 }
 
+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }

ggml/src/ggml-backend.cpp

Lines changed: 15 additions & 2 deletions

@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
-
-    GGML_ASSERT(buft);
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -1694,6 +1693,20 @@
     sched->is_alloc = false;
 }
 
+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml/src/ggml.c

Lines changed: 5 additions & 0 deletions

@@ -7345,6 +7345,11 @@ size_t ggml_quantize_chunk(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    *log_callback = g_logger_state.log_callback;
+    *user_data = g_logger_state.log_callback_user_data;
+}
+
 void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;

include/llama.h

Lines changed: 16 additions & 1 deletion

@@ -297,6 +297,7 @@ extern "C" {
         bool check_tensors; // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host; // bypass host buffer allowing extra buffers to be used
+        bool no_alloc; // only load metadata and simulate memory allocations
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -450,10 +451,23 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    LLAMA_API bool llama_params_fit(
+        const char * path_model,
+        struct llama_model_params * mparams,
+        struct llama_context_params * cparams,
+        float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+        size_t margin, // margin of memory to leave per device in bytes
+        uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+        enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
@@ -1332,7 +1346,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     //
     // Performance utils
