
Commit 5bc7a74

llama: automatically set runtime params to fit VRAM

1 parent 5787b5d · commit 5bc7a74

20 files changed: +376 −84 lines

common/common.cpp

Lines changed: 69 additions & 2 deletions
@@ -885,9 +885,78 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
+static void common_fit_to_free_memory(
+        const std::string & path_model, llama_model_params & mparams, llama_context_params & cparams, const size_t margin) {
+
+    std::vector<ggml_backend_dev_t> devices(ggml_backend_dev_count());
+    for (size_t i = 0; i < devices.size(); i++) {
+        devices[i] = ggml_backend_dev_get(i);
+    }
+
+    std::vector<size_t> memory_total(devices.size());
+    std::vector<size_t> memory_free(devices.size());
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_memory(devices[i], memory_free.data() + i, memory_total.data() + i);
+    }
+
+    auto get_min_margin = [path_model, memory_free](const llama_model_params & mparams_test, const llama_context_params & cparams_test) {
+        std::vector<size_t> memory_expect(memory_free.size());
+        GGML_ASSERT(llama_expected_memory_use(path_model.c_str(), mparams_test, cparams_test, memory_expect.data()));
+
+        int64_t min_margin = INT64_MAX;
+        for (size_t i = 0; i < memory_free.size(); i++) {
+            min_margin = std::min(min_margin, int64_t(memory_free[i]) - int64_t(memory_expect[i]));
+        }
+        return min_margin;
+    };
+    auto test_ngl = [mparams, cparams, get_min_margin](const int ngl) {
+        llama_model_params mparams_test = mparams;
+        mparams_test.n_gpu_layers = ngl;
+        return get_min_margin(mparams_test, cparams);
+    };
+
+    int ngl_low = 0;
+    int64_t margin_low = test_ngl(ngl_low);
+    if (margin_low < int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_low;
+        return;
+    }
+
+    int ngl_high = 128; // FIXME
+    int64_t margin_high = test_ngl(ngl_high);
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+        return;
+    }
+
+    // TODO: bisection is inefficient, better to interpolate if the max ngl value is known
+    while (ngl_high - ngl_low > 1) {
+        const int ngl_test = (ngl_high + ngl_low) / 2;
+        const int64_t margin_test = test_ngl(ngl_test);
+
+        if (margin_test < int64_t(margin)) {
+            ngl_high = ngl_test;
+            margin_high = margin_test;
+        } else {
+            ngl_low = ngl_test;
+            margin_low = margin_test;
+        }
+    }
+
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+    } else {
+        mparams.n_gpu_layers = ngl_low;
+    }
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    constexpr size_t margin = 1024*1024*1024;
+    common_fit_to_free_memory(params.model.path, mparams, cparams, margin);
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
@@ -925,8 +994,6 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }
 
-    auto cparams = common_context_params_to_llama(params);
-
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
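
Note: the bisection in common_fit_to_free_memory relies on an implicit assumption that the minimum free-memory margin across devices shrinks monotonically as n_gpu_layers grows. Under that assumption, a minimal standalone sketch of the same search, where margin_for_ngl is a hypothetical stand-in for the test_ngl lambda above:

#include <cstdint>
#include <functional>

// Returns the largest ngl in [ngl_low, ngl_high] whose margin stays >= min_margin,
// assuming margin_for_ngl(ngl) is non-increasing in ngl.
static int fit_ngl(int ngl_low, int ngl_high, int64_t min_margin,
                   const std::function<int64_t(int)> & margin_for_ngl) {
    if (margin_for_ngl(ngl_low) < min_margin) {
        return ngl_low; // even the minimum does not fit, nothing better to do
    }
    if (margin_for_ngl(ngl_high) >= min_margin) {
        return ngl_high; // everything fits, offload the maximum
    }
    while (ngl_high - ngl_low > 1) { // invariant: ngl_low fits, ngl_high does not
        const int ngl_test = (ngl_low + ngl_high) / 2;
        if (margin_for_ngl(ngl_test) >= min_margin) {
            ngl_low = ngl_test;
        } else {
            ngl_high = ngl_test;
        }
    }
    return ngl_low;
}

With ngl_high = 128 this costs at most 2 + log2(128) = 9 probes, and each probe only estimates memory use via llama_expected_memory_use instead of actually loading the model.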

ggml/include/ggml-alloc.h

Lines changed: 5 additions & 1 deletion
@@ -9,6 +9,7 @@ extern "C" {
 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
 typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
+typedef struct ggml_backend_device * ggml_backend_dev_t;
 
 // Tensor allocator
 struct ggml_tallocr {
@@ -58,16 +59,19 @@ GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
     const int * node_buffer_ids,
-    const int * leaf_buffer_ids);
+    const int * leaf_buffer_ids,
+    bool dry_run);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
 GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev);
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
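
A minimal sketch of how the new _size variant might be used to measure a context's tensor footprint before committing any memory; the tensor shape here is arbitrary and only for illustration:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

struct ggml_init_params ip = {
    /*.mem_size   =*/ 2*ggml_tensor_overhead(),
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true, // tensor data is allocated later, from the buft
};
struct ggml_context * ctx = ggml_init(ip);
ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);

ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
size_t nbytes = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft); // dry run, no allocation
// ... check nbytes against the available memory, then allocate for real:
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);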

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -293,6 +293,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); // result per backend is written to sizes
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
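
A sketch of the intended call pattern for the new entry point, assuming an existing scheduler sched and a representative worst-case graph gf (plus <cstdio> and <vector>); sizes are reported in the scheduler's backend order:

std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched));
ggml_backend_sched_reserve_size(sched, gf, sizes.data());
for (int i = 0; i < (int) sizes.size(); i++) {
    ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
    printf("%s: expected compute buffer size = %zu bytes\n", ggml_backend_name(backend), sizes[i]);
}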

ggml/src/ggml-alloc.c

Lines changed: 50 additions & 10 deletions
@@ -150,6 +150,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif
 
+// returns the offset for the allocation
 static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -472,7 +473,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }
 
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but may not yet be allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }
 
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
@@ -670,7 +673,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
+        const int * node_buffer_ids, const int * leaf_buffer_ids, bool dry_run) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -768,7 +772,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #endif
 
             ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], dry_run ? 0 : new_size);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
@@ -781,7 +785,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL, /*dry_run =*/ false);
 }
 
 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
@@ -934,6 +938,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
+size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev) {
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        if (ggml_backend_buft_get_device(galloc->bufts[i]) == dev) {
+            return ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
+        }
+    }
+    return 0;
+}
+
 // utils
 
 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
@@ -984,14 +997,16 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }
 
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool dry_run) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
 
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
 
     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;
 
     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1003,10 +1018,13 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
 
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
-                return NULL;
+            if (!dry_run) {
+                if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                    return NULL;
+                }
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;
@@ -1015,15 +1033,23 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
 
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
-            return NULL;
+        *nbytes_total += cur_buf_size;
+        if (!dry_run) {
+            if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
         }
     }
 
+    if (dry_run) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }
 
@@ -1033,10 +1059,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     } else {
         buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if dry_run or context is empty
+    }
     return buffer;
 }
 
+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
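
Taken together, the dry_run path sizes the per-buffer allocations without requesting device memory (the backing buffers are allocated with size 0), and ggml_gallocr_get_max_size reads the measured maximum back out per device. A sketch, assuming galloc, graph, the buffer-id arrays, and a device dev already exist:

if (ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*dry_run =*/ true)) {
    const size_t nbytes = ggml_gallocr_get_max_size(galloc, dev);
    // nbytes is what a real (non-dry-run) reservation would request on dev
}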

ggml/src/ggml-backend.cpp

Lines changed: 20 additions & 2 deletions
@@ -1347,7 +1347,8 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
+            sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
@@ -1546,14 +1547,31 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }
 
+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_backend_sched_synchronize(sched);
+
+    GGML_ASSERT(ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
+        sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ true));
+    for (int ib = 0; ib < sched->n_backends; ib++) {
+        sizes[ib] = ggml_gallocr_get_max_size(sched->galloc, ggml_backend_get_device(sched->backends[ib]));
+    }
+
+    ggml_backend_sched_reset(sched);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
     ggml_backend_sched_synchronize(sched);
 
-    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
+            sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false)) {
         return false;
     }
 
include/llama.h

Lines changed: 7 additions & 0 deletions
@@ -414,6 +414,13 @@ extern "C" {
     LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
+    // returns success
+    LLAMA_API bool llama_expected_memory_use(
+            const char * path_model,
+            struct llama_model_params mparams,
+            struct llama_context_params cparams,
+            size_t * nbytes_expect);
+
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
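
A sketch of querying the expected per-device memory use before loading anything, sized one result slot per backend device to match how common_fit_to_free_memory fills its array above ("model.gguf" is a placeholder path):

llama_model_params   mparams = llama_model_default_params();
llama_context_params cparams = llama_context_default_params();

std::vector<size_t> nbytes_expect(ggml_backend_dev_count());
if (llama_expected_memory_use("model.gguf", mparams, cparams, nbytes_expect.data())) {
    for (size_t i = 0; i < nbytes_expect.size(); i++) {
        printf("device %zu: expected memory use = %zu bytes\n", i, nbytes_expect[i]);
    }
}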
