diff --git a/common/arg.cpp b/common/arg.cpp index a465eb36234e7..836f5e80e130e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -1434,7 +1435,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } - if (!params.tensor_buft_overrides.empty()) { + // pad tensor_buft_overrides for llama_params_fit: + const size_t ntbo = llama_max_tensor_buft_overrides(); + while (params.tensor_buft_overrides.size() < ntbo) { params.tensor_buft_overrides.push_back({nullptr, nullptr}); } @@ -2961,6 +2964,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_MAIN_GPU")); + add_opt(common_arg( + { "-fit", "--fit" }, "[on|off]", + string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"), + [](common_params & params, const std::string & value) { + if (is_truthy(value)) { + params.fit_params = true; + } else if (is_falsey(value)) { + params.fit_params = false; + } else { + throw std::runtime_error( + string_format("error: unkown value for --fit: '%s'\n", value.c_str())); + } + } + ).set_env("LLAMA_ARG_FIT")); + add_opt(common_arg( + { "-fitm", "--fit-margin" }, "MiB", + string_format("target margin per device for --fit option, default: %zu", params.fit_params_margin/(1024*1024)), + [](common_params & params, int value) { + params.fit_params_margin = value * size_t(1024*1024); + } + ).set_env("LLAMA_ARG_FIT_MARGIN")); + add_opt(common_arg( + { "-fitc", "--fit-ctx" }, "N", + string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx), + [](common_params & params, int value) { + params.fit_params_min_ctx = value; + } + ).set_env("LLAMA_ARG_FIT_CTX")); add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), diff --git a/common/common.cpp b/common/common.cpp index b0591e84b0668..b60655f8c12e0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -916,6 +916,19 @@ std::string fs_get_cache_file(const std::string & filename) { struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; auto mparams = common_model_params_to_llama(params); + auto cparams = common_context_params_to_llama(params); + + if (params.fit_params) { + const bool fit_successful = llama_params_fit(params.model.path.c_str(), &mparams, &cparams, + params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx, + params.verbosity > 0 ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + + if (fit_successful) { + LOG_INF("%s: successfully fit parameters to device memory\n", __func__); + } else { + LOG_WRN("%s: failed to fit parameters to device memory, may crash during allocation\n", __func__); + } + } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { @@ -926,8 +939,6 @@ struct common_init_result common_init_from_params(common_params & params) { const llama_vocab * vocab = llama_model_get_vocab(model); - auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", diff --git a/common/common.h b/common/common.h index a8cb630ea5805..15ef3651c7cbc 100644 --- a/common/common.h +++ b/common/common.h @@ -274,8 +274,8 @@ struct lr_opt { struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); struct common_params { - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 4096; // context size + int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit + int32_t n_ctx = 0; // context size, 0 == context the model was trained with int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt @@ -296,9 +296,12 @@ struct common_params { // offload params std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + bool fit_params = true; // whether to fit unset model/context parameters to free device memory + size_t fit_params_margin = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory + int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 2cb150fd2a313..78aa059dde380 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); // call with a worst-case graph to avoid buffer reallocations // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed // returns false if the buffer allocation failed +// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids, + size_t * sizes); GGML_API bool ggml_gallocr_reserve_n( ggml_gallocr_t galloc, struct ggml_cgraph * 
graph, @@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i // Utils // Create a buffer and allocate all the tensors in a ggml_context +// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index f1b740785914e..4ed5f35774ffc 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -307,6 +307,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d948b00cc7f30..c5b378e9e5fde 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2509,7 +2509,8 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data); + GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 91aff205f1832..45f014d846ba0 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -602,7 +602,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return t->data != NULL // tensor data already set externally + || t->buffer // tensor on external buffer (but not yet allocated) + || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc } // free the extra space at the end if the new tensor is smaller @@ -820,7 +822,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { +static bool ggml_gallocr_reserve_n_impl( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -922,14 +925,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c if (realloc) { #ifndef NDEBUG size_t cur_size = galloc->buffers[i] ? 
ggml_vbuffer_size(galloc->buffers[i]) : 0; - GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", + __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_vbuffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); - if (galloc->buffers[i] == NULL) { - GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); - return false; + if (no_alloc) { + galloc->buffers[i] = NULL; + } else { + galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); + if (galloc->buffers[i] == NULL) { + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } } } } @@ -937,6 +945,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c return true; } +void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) { + GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true)); + for (int i = 0; i < galloc->n_buffers; i++) { + sizes[i] = 0; + for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { + sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size; + } + } +} + +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { + return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false); +} + bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } @@ -1139,7 +1162,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return true; } -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { +static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( + struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) { GGML_ASSERT(ggml_get_no_alloc(ctx) == true); size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -1147,6 +1171,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte ggml_backend_buffer_t * buffers = NULL; size_t n_buffers = 0; + *nbytes_total = 0; size_t cur_buf_size = 0; struct ggml_tensor * first = ggml_get_first_tensor(ctx); @@ -1158,10 +1183,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } first = t; + *nbytes_total += cur_buf_size; cur_buf_size = this_size; } else { cur_buf_size += this_size; @@ -1170,15 +1196,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte // allocate remaining tensors 
if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + *nbytes_total += cur_buf_size; + if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } } + if (no_alloc) { + return NULL; + } + if (n_buffers == 0) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif + GGML_ASSERT(!buffers); return NULL; } @@ -1188,10 +1220,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); } - free(buffers); + if (buffers) { + free(buffers); // can be NULL if context is empty or no_alloc + } return buffer; } +size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true); + GGML_ASSERT(!buf); + return nbytes_total; +} + +ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false); +} + ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index ff9135fe2d878..a4507da93363c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_ASSERT(buft); if (size == 0) { // return a dummy buffer for zero-sized allocations return ggml_backend_buffer_init(buft, {}, NULL, 0); } - - GGML_ASSERT(buft); return buft->iface.alloc_buffer(buft, size); } @@ -1694,6 +1693,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched->is_alloc = false; } +void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) { + GGML_ASSERT(sched); + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); + GGML_ASSERT(sizes); + + ggml_backend_sched_reset(sched); + + ggml_backend_sched_synchronize(sched); + + ggml_backend_sched_split_graph(sched, measure_graph); + + ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes); +} + bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT(sched); GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9be35c1be8456..bb53290fdacea 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7345,6 +7345,11 @@ size_t ggml_quantize_chunk( //////////////////////////////////////////////////////////////////////////////// +void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) { + *log_callback = g_logger_state.log_callback; + *user_data = g_logger_state.log_callback_user_data; +} + void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? 
log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; diff --git a/include/llama.h b/include/llama.h index a0a660bff88da..d54a9b62ee96b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -297,6 +297,7 @@ extern "C" { bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used + bool no_alloc; // only load metadata and simulate memory allocations }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations @@ -450,10 +451,23 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); + // fits mparams and cparams to free device memory (assumes system memory is unlimited) + // returns true if the parameters could be successfully modified to fit device memory + LLAMA_API bool llama_params_fit( + const char * path_model, + struct llama_model_params * mparams, + struct llama_context_params * cparams, + float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements + struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements + size_t margin, // margin of memory to leave per device in bytes + uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use + enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log + LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); LLAMA_API size_t llama_max_parallel_sequences(void); + LLAMA_API size_t llama_max_tensor_buft_overrides(void); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); @@ -1332,7 +1346,8 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); + LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); // // Performance utils diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6192a36e0ee5..7c789a46bc1e6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -208,6 +208,7 @@ llama_context::llama_context( backend_buft.clear(); backend_ptrs.clear(); + backend_buf_exp_size.clear(); for (auto & backend : backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); @@ -224,6 +225,7 @@ llama_context::llama_context( backend_buft.push_back(buft); backend_ptrs.push_back(backend.get()); + backend_buf_exp_size.push_back(0); } LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); @@ -339,7 +341,8 @@ llama_context::llama_context( // reserve pp (prompt processing) graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), + model.hparams.no_alloc, model.hparams.no_alloc ? 
backend_buf_exp_size.data() : nullptr); if (!gf) { if (pipeline_parallel) { LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__); @@ -357,7 +360,7 @@ llama_context::llama_context( // reserve with tg (token generation) graph to get the number of splits and nodes { - auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); + auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -372,7 +375,7 @@ llama_context::llama_context( // // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); // - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -381,11 +384,13 @@ llama_context::llama_context( for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { + if (!model.hparams.no_alloc) { + backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend); + } + if (backend_buf_exp_size[i] > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + backend_buf_exp_size[i] / 1024.0 / 1024.0); } } @@ -404,6 +409,22 @@ llama_context::llama_context( } llama_context::~llama_context() { + if (!model.hparams.no_alloc) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + + const size_t size_exp = backend_buf_exp_size[i]; + const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size_exp == size_act) { + LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } else { + LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } + } + } ggml_opt_free(opt_ctx); } @@ -1374,7 +1395,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const { return static_cast(gf_res_reserve.get()); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) { +ggml_cgraph * llama_context::graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); GGML_ASSERT(n_outputs >= 1); @@ -1411,8 +1433,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u // initialize scheduler with the specified graph if (split_only) { - ggml_backend_sched_split_graph(sched.get(), gf); + if (sizes) { + ggml_backend_sched_reserve_size(sched.get(), gf, sizes); + } else { + ggml_backend_sched_split_graph(sched.get(), gf); + } } else if (!ggml_backend_sched_reserve(sched.get(), gf)) { + GGML_ASSERT(!sizes); LLAMA_LOG_ERROR("%s: failed to allocate compute 
buffers\n", __func__); return nullptr; } @@ -2034,15 +2061,26 @@ void llama_context::perf_reset() { std::map llama_context::memory_breakdown() const { std::map ret; - for (const auto & buft_size : model.memory_breakdown()) { - ret[buft_size.first].model += buft_size.second; + for (const auto & [buft, size] : model.memory_breakdown()) { + ret[buft].model += size; } - for (const auto & buft_size : memory->memory_breakdown()) { - ret[buft_size.first].context += buft_size.second; + if (memory) { + for (const auto & [buft, size] : memory->memory_breakdown()) { + ret[buft].context += size; + } } - for (const auto & backend_ptr : backends) { - ggml_backend_t backend = backend_ptr.get(); - ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (model.hparams.no_alloc) { + for (size_t i = 0; i < backends.size(); ++i) { + ggml_backend_t backend = backends[i].get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += backend_buf_exp_size[i]; + } + } else { + for (const auto & backend_ptr : backends) { + ggml_backend_t backend = backend_ptr.get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + } } return ret; } diff --git a/src/llama-context.h b/src/llama-context.h index ed6d82cb396f9..f03e4cca79242 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -206,7 +206,8 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false); + ggml_cgraph * graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); private: llm_graph_params graph_params( @@ -281,9 +282,10 @@ struct llama_context { std::vector> set_n_threads_fns; - // buffer types used for the compute buffer of each backend + // pointers and buffer types used for the compute buffer of each backend std::vector backend_ptrs; std::vector backend_buft; + std::vector backend_buf_exp_size; // expected buffer sizes llm_graph_result_ptr gf_res_prev; llm_graph_result_ptr gf_res_reserve; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6fcf91b7daa47..b0b373e9af99f 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -34,6 +34,7 @@ struct llama_hparams_convnext { struct llama_hparams { bool vocab_only; + bool no_alloc; bool rope_finetuned; bool use_par_res; bool swin_norm; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..87b9d8bb27a55 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -25,6 +25,10 @@ time_meas::~time_meas() { } } +void llama_log_get(ggml_log_callback * log_callback, void ** user_data) { + ggml_log_get(log_callback, user_data); +} + void llama_log_set(ggml_log_callback log_callback, void * user_data) { ggml_log_set(log_callback, user_data); g_logger_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index add74391f0c47..b0338d302c331 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache( // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto & [buft, ctx] : ctx_map) { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); + ggml_backend_buffer_t buf; + if (model.hparams.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { + t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer + } if (!buf) { throw std::runtime_error("failed to allocate buffer for kv cache"); } @@ -476,9 +484,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { std::map llama_kv_cache::memory_breakdown() const { std::map ret; - for (const auto & [_, buf] : ctxs_bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, buf] : ctxs_bufs) { + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get()); + + if (hparams.no_alloc) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[buft] += ggml_backend_buffer_get_size(buf.get()); + } } + return ret; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index aa3a65f87a542..ca2ea2461d223 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; @@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader( this->use_mmap = use_mmap; this->check_tensors = check_tensors; + this->no_alloc = no_alloc; } std::string llama_model_loader::get_arch_name() const { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c9189f6cb4466..0380c92fde0e3 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -71,6 +71,7 @@ struct llama_model_loader { bool use_mmap = false; bool check_tensors; + bool no_alloc; llama_files files; llama_ftype ftype; @@ -97,6 +98,7 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bb83a04e96055..a0cedcd415275 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6187,9 +6187,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { std::vector bufs; if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) { + GGML_ASSERT(!ml.no_alloc); for (uint32_t idx = 0; idx < ml.files.size(); idx++) { // only the mmap region containing the tensors in the model is mapped to the backend buffer - // this 
is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, + // then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size void * addr = nullptr; size_t first, last; // NOLINT @@ -6205,9 +6207,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { bufs.emplace_back(buf); buf_map.emplace(idx, buf); } - } - else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + } else { + ggml_backend_buffer_t buf; + if (ml.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer + } if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } @@ -6262,6 +6271,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (ml.no_alloc) { + return true; + } + // load tensor data for (auto & [ctx, buf_map] : ctx_buf_maps) { if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { @@ -6304,9 +6317,18 @@ size_t llama_model::n_devices() const { std::map llama_model::memory_breakdown() const { std::map ret; - for (const auto & [_, bufs] : pimpl->ctxs_bufs) { - for (const auto & buf : bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) { + if (hparams.no_alloc) { + GGML_ASSERT(bufs.size() == 1); + ggml_backend_buffer_t buf = bufs[0].get(); + GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr); + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + for (const auto & buf : bufs) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + } } } return ret; @@ -6351,6 +6373,7 @@ void llama_model::print_info() const { // hparams LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str()); LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only); + LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc); if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -20218,6 +20241,7 @@ llama_model_params llama_model_default_params() { /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, /*.no_host =*/ false, + /*.no_alloc =*/ false, }; return result; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6dd40412b488e..7c560aac23f97 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, 
nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index ab2e9868af468..c6527a8908bba 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,9 @@ +#include "llama.h" + #include "llama-impl.h" #include "llama-chat.h" +#include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" @@ -11,6 +14,7 @@ #include "ggml-backend.h" #include +#include #include #include #include @@ -37,6 +41,542 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty GGML_ABORT("fatal error"); } +struct llama_device_memory_data { + int64_t total; + int64_t free; + llama_memory_breakdown_data mb; +}; + +static std::vector llama_get_device_memory_data( + const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, + std::vector & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert, const ggml_log_level log_level) { + struct user_data_t { + struct { + ggml_log_callback callback; + void * user_data; + } original_logger; + ggml_log_level min_level; // prints below this log level go to debug log + }; + user_data_t ud; + llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data); + ud.min_level = log_level; + + llama_log_set([](ggml_log_level level, const char * text, void * user_data) { + const user_data_t * ud = (const user_data_t *) user_data; + const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG; + ud->original_logger.callback(level_eff, text, ud->original_logger.user_data); + }, &ud); + + llama_model_params mparams_copy = *mparams; + mparams_copy.no_alloc = true; + mparams_copy.use_mmap = false; + + llama_model * model = llama_model_load_from_file(path_model, mparams_copy); + if (model == nullptr) { + throw std::runtime_error("failed to load model"); + } + + llama_context * ctx = llama_init_from_model(model, *cparams); + if (ctx == nullptr) { + llama_model_free(model); + throw std::runtime_error("failed to create llama_context from model"); + } + + std::vector ret(model->devices.size()); + + std::map memory_breakdown = ctx->memory_breakdown(); + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + + if (ggml_backend_buft_is_host(buft)) { + continue; + } + + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (!dev) { + continue; + } + for (size_t i = 0; i < ret.size(); i++) { + if (model->devices[i] == dev) { + ret[i].mb.model += mb.model; + ret[i].mb.context += mb.context; + ret[i].mb.compute += mb.compute; + break; + } + } + } + for (size_t i = 0; i < ret.size(); i++) { + size_t free, total; + ggml_backend_dev_memory(model->devices[i], &free, &total); + ret[i].free = free; + ret[i].total = total; + } + + devs = model->devices; + hp_ngl = model->hparams.n_layer; + hp_n_ctx_train = model->hparams.n_ctx_train; + hp_n_expert = model->hparams.n_expert; + + llama_memory_breakdown_print(ctx); // goes to debug log + + llama_free(ctx); + llama_model_free(model); + llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); + return ret; +} + + +bool llama_params_fit( + const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, + float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overides, + size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level 
log_level) { + constexpr int64_t MiB = 1024*1024; + const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits + typedef std::vector dmds_t; + const llama_model_params default_mparams = llama_model_default_params(); + + std::vector devs; + uint32_t hp_ngl = 0; // hparams.n_gpu_layers + uint32_t hp_nct = 0; // hparams.n_ctx_train + uint32_t hp_nex = 0; // hparams.n_expert + + // step 1: get data for default parameters and check whether any changes are necessary in the first place + + LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); + const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + const size_t nd = devs.size(); // number of devices + if (nd == 0) { + LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); + return true; + } + + std::vector dev_names; + { + dev_names.reserve(nd); + size_t max_length = 0; + for (ggml_backend_dev_t dev : devs) { + std::string name = ggml_backend_dev_name(dev); + name += " ("; + name += ggml_backend_dev_description(dev); + name += ")"; + dev_names.push_back(name); + max_length = std::max(max_length, name.length()); + } + for (std::string & dn : dev_names) { + dn.insert(dn.end(), max_length - dn.length(), ' '); + } + } + + int64_t sum_total = 0; + int64_t sum_projected_free = 0; + int64_t min_projected_free = INT64_MAX; + int64_t sum_projected_used = 0; + int64_t sum_projected_ctx = 0; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); + } + for (size_t id = 0; id < nd; id++) { + const llama_device_memory_data & dmd = dmds_full[id]; + + const int64_t projected_used = dmd.mb.model + dmd.mb.context + dmd.mb.compute; + const int64_t projected_free = dmd.free - projected_used; + + sum_total += dmd.total; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + min_projected_free = std::min(min_projected_free, projected_free); + sum_projected_ctx += dmd.mb.context; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n", + __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB, + projected_free >= 0 ? "surplus" : "deficit"); + } + } + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. 
a total of %" PRId64 " MiB\n", + __func__, sum_projected_used/MiB, sum_total/MiB); + if (min_projected_free >= margin) { + if (nd == 1) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return true; + } + LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return true; + } + + // step 2: try reducing memory use by reducing the context size + + { + int64_t global_surplus = sum_projected_free - int64_t(nd)*margin; + if (global_surplus < 0) { + if (nd == 1) { + LLAMA_LOG_INFO("%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", + __func__, margin/MiB, -global_surplus/MiB); + } else { + LLAMA_LOG_INFO("%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n", + __func__, margin/MiB, -global_surplus/MiB); + } + + if (cparams->n_ctx == 0) { + if (hp_nct > n_ctx_min) { + const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct; + const uint32_t ctx_reduction = std::min( + uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); + cparams->n_ctx = hp_nct - ctx_reduction; + const int64_t memory_reduction = ctx_reduction * bytes_per_ctx; + global_surplus += memory_reduction; + LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory\n", + __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); + } else { + LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", + __func__, hp_nct, n_ctx_min); + } + } else { + LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); + } + } + if (global_surplus > 0) { + LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__); + return true; + } + } + + if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { + LLAMA_LOG_INFO("%s: n_gpu_layers already set by user to %" PRId32 ", abort\n", __func__, mparams->n_gpu_layers); + return false; + } + if (nd > 1) { + if (!tensor_split) { + LLAMA_LOG_INFO("%s: did not provide a buffer to write the tensor_split to, abort\n", __func__); + return false; + } + if (mparams->tensor_split) { + for (size_t id = 0; id < nd; id++) { + if (mparams->tensor_split[id] != 0.0f) { + LLAMA_LOG_INFO("%s: model_params::tensor_split already set by user, abort\n", __func__); + return false; + } + } + } + if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { + LLAMA_LOG_INFO("%s: changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort\n", __func__); + return false; + } + } + if (hp_nex > 0 && !tensor_buft_overides) { + LLAMA_LOG_INFO("%s: did not provide buffer to set tensor_buft_overrides for MoE model, abort\n", __func__); + return false; + } + if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { + LLAMA_LOG_INFO("%s: model_params::tensor_buft_overrides already set by user, abort\n", __func__); + return false; + } + + // utility function that returns the memory use per device for a constant number of layers per device + auto get_memory_for_const_layer = [&](const int layers_per_device) -> std::vector { + llama_model_params mparams_copy = *mparams; + mparams_copy.n_gpu_layers = nd * layers_per_device; 
+ if (nd > 1) { + for (size_t id = 0; id < nd; id++) { + tensor_split[id] = 1.0f; + } + } + mparams_copy.tensor_split = tensor_split; + const dmds_t dmd_nl = llama_get_device_memory_data( + path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + std::vector ret; + ret.reserve(nd); + for (const llama_device_memory_data & dmd : dmd_nl) { + ret.push_back(dmd.mb.model + dmd.mb.context + dmd.mb.compute); + } + return ret; + }; + + struct memory_scaling { + int64_t base = 0; + int64_t per_layer = 0; + }; + + // utility function that returns how memory use scales with the number of GPU layers per device + auto get_memory_scaling = [&](const std::vector & mem_1l, const std::vector & mem_nl, const uint32_t n) -> std::vector { + std::vector ret(nd); + for (size_t id = 0; id < nd; id++) { + ret[id].per_layer = (mem_nl[id] - mem_1l[id]) / int64_t(n - 1); + ret[id].base = mem_1l[id] - ret[id].per_layer; + } + return ret; + }; + + if (hp_nex > 0) { + const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors + ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type(); + tensor_buft_overides[0] = {pattern_moe_all.c_str(), cpu_buft}; + tensor_buft_overides[1] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overides; + + LLAMA_LOG_DEBUG("%s: getting device memory data for all MoE tensors in system memory:\n", __func__); + const dmds_t dmds_cpu_moe = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + int64_t global_surplus = 0; + for (const llama_device_memory_data & dmd : dmds_cpu_moe) { + global_surplus += dmd.free; + global_surplus -= int64_t(dmd.mb.model + dmd.mb.context + dmd.mb.compute) + margin; + } + if (global_surplus > 0) { + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", __func__, global_surplus/MiB); + + // step 3: for MoE models, if at least the dense tensors can be fit, try fitting as many full layers as possible + + const uint32_t nl_scaling = hp_ngl / nd; + std::vector spl_part; // size per device and per partial == Moe only layer + { + LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all MoE tensors in system memory:\n", __func__); + auto tmp1 = get_memory_for_const_layer(1); + LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all MoE tensors in system memory:\n", __func__, nl_scaling); + auto tmpn = get_memory_for_const_layer(nl_scaling); + spl_part = get_memory_scaling(tmp1, tmpn, nl_scaling); + } + for (size_t id = 0; id < nd; id++) { + LLAMA_LOG_DEBUG("%s: spl_part[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n", + __func__, id, spl_part[id].base/MiB, spl_part[id].per_layer/MiB); + } + + // for spl_part all MoE tensors were still on CPU, reset the TBOs so that all tensors are on the devices again + tensor_buft_overides[0] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overides; + + std::vector spl_full; // size per device and per full layer + { + LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all tensors in device memory:\n", __func__); + auto tmp1 = get_memory_for_const_layer(1); + LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all tensors in device memory:\n", __func__, nl_scaling); + auto tmpn = get_memory_for_const_layer(nl_scaling); + spl_full = get_memory_scaling(tmp1, tmpn, nl_scaling); + } + for (size_t id = 0; id < nd; id++) { + LLAMA_LOG_DEBUG("%s: 
spl_full[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n", + __func__, id, spl_full[id].base/MiB, spl_full[id].per_layer/MiB); + } + + // the non-repeating tensors (e.g. output matrix) are difficult to quantify, + // get memory use with all tensors on the last device and use that as the starting point for the last device only + for (size_t id = 0; id < nd - 1; id++) { + tensor_split[id] = 0.0f; + } + tensor_split[nd - 1] = 1.0f; + LLAMA_LOG_DEBUG("%s: getting device memory data with entire model on last device:\n", __func__); + const dmds_t dmds_last = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + tensor_split[nd - 1] = 0.0f; + + struct ngl { + uint32_t part = 0; + uint32_t full = 0; + + explicit operator std::string() const { + return "[" + std::to_string(part) + ", " + std::to_string(full) + "]"; + } + }; + + // utility function that distributes layers to devices and returns whether the memory margin can be met on all devices + // - ngl_per_device: resulting distribution of dense-only/full layers across devices + // - global_ngl_part: total number of sense-only layers + auto distribute_layers = [&](std::vector & ngl_per_device, std::vector & usable_memory, const uint32_t global_ngl_part) -> bool { + // reset result to initial state, initially put entire model on the last device + for (size_t id = 0; id < nd - 1; id++) { + ngl_per_device[id] = {0, 0}; + } + ngl_per_device.back().part = 0; + ngl_per_device.back().full = hp_ngl + 1; + + // usable_memory: free memory above margin that can be used for further allocations + for (size_t id = 0; id < nd - 1; id++) { + int64_t um = dmds_last[id].free - margin - spl_full[id].base; + um = std::max(um, int64_t(0)); + usable_memory[id] = um; + } + { + const llama_memory_breakdown_data & mb = dmds_last.back().mb; + usable_memory.back() = dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin; + } + + // convert some layers on the last device from full layers to dense-only layers + ngl_per_device.back().full -= global_ngl_part; + usable_memory.back() += spl_full.back().per_layer*global_ngl_part; + ngl_per_device.back().part += global_ngl_part; + usable_memory.back() -= spl_part.back().per_layer*global_ngl_part; + + // for a single device checking the usable memory is always sufficient: + if (nd == 1) { + return usable_memory.back() >= 0; + } + + // iterate over devices from front to back and move layers to other devices until memory requirements are met + // move full layers first, then dense-only layers + for (int id = nd - 1; id >= 0 && usable_memory.back() < 0; id--) { + uint32_t ngl_move = ngl_per_device.back().full - 1; + ngl_move = std::min(ngl_move, uint32_t( usable_memory[id] / spl_full[id].per_layer)); + + // round up the number of layers only if there are insuffient dense-only layers to cover the deficit: + if (-usable_memory.back() < int64_t(ngl_per_device.back().part)*spl_part.back().per_layer) { + ngl_move = std::min(ngl_move, + uint32_t((-usable_memory.back() + spl_full.back().per_layer - 1) / spl_full.back().per_layer)); + } else { + ngl_move = std::min(ngl_move, uint32_t(-usable_memory.back() / spl_full.back().per_layer)); + } + + ngl_per_device.back().full -= ngl_move; + ngl_per_device[id].full += ngl_move; + usable_memory.back() += ngl_move * spl_full.back().per_layer; + usable_memory[id] -= ngl_move * spl_full[id].per_layer; + } + for (int id = nd - 1; id >= 0 && usable_memory.back() < 0; id--) { + uint32_t ngl_move = ngl_per_device.back().part; + 
ngl_move = std::min(ngl_move, uint32_t(usable_memory[id] / spl_part[id].per_layer)); + ngl_move = std::min(ngl_move, + uint32_t((-usable_memory.back() + spl_part.back().per_layer - 1) / spl_part.back().per_layer)); + + ngl_per_device.back().part -= ngl_move; + ngl_per_device[id].part += ngl_move; + usable_memory.back() += ngl_move * spl_part.back().per_layer; + usable_memory[id] -= ngl_move * spl_part[id].per_layer; + } + + // by design all but the last device have only been filled up to their margin, + // therefore only the last device needs to be checked + return usable_memory.back() >= 0; + }; + + // iteratively increase the number of partial layers until the memory consumption is low enough + std::vector ngl_per_device(nd); + { + std::vector usable_memory(nd); + for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) { + const bool success = distribute_layers(ngl_per_device, usable_memory, global_ngl_part); + std::string ngl_per_device_str = std::string(ngl_per_device[0]); + std::string usable_memory_str = std::to_string(usable_memory[0]/MiB); + for (size_t id = 1; id < nd; id++) { + ngl_per_device_str += ", " + std::string(ngl_per_device[id]); + usable_memory_str += ", " + std::to_string(usable_memory[id]/MiB); + } + LLAMA_LOG_DEBUG("%s: global_ngl_part=%" PRIu32 ", success=%d, ngl_per_device=[%s], usable_memory[MiB]=[%s]\n", + __func__, global_ngl_part, success ? 1 : 0, ngl_per_device_str.c_str(), usable_memory_str.c_str()); + if (success) { + break; + } + } + } + + // utility function that returns a static C string matching the MoE tensors for a specific layer: + auto get_moe_pattern = [&](const size_t il) -> const char * { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." + std::to_string(patterns.size()) + "\\.ffn_(up|down|gate)_(ch|)exps"); + } + return patterns[il].c_str(); + }; + + // iterate over devices, add 1 TBO per dense-only layer, track total number of layers + uint32_t global_ngl_part = 0; + uint32_t global_ngl_full = 0; + bool sufficient_tbo = true; + { + const size_t ntbo = llama_max_tensor_buft_overrides(); + size_t itbo = 0; + uint32_t il0 = 0; + for (size_t id = 0; id < nd && itbo + 1 < ntbo; id++) { + for (uint32_t il = il0; il < il0 + ngl_per_device[id].part; il++) { + if (itbo + 1 >= ntbo) { + LLAMA_LOG_INFO("%s: llama_params_fit_n_tensor_buft_overrides() == %zu is insufficient for model\n", __func__, ntbo); + sufficient_tbo = false; + break; + } + tensor_buft_overides[itbo].pattern = get_moe_pattern(il); + tensor_buft_overides[itbo].buft = cpu_buft; + itbo++; + } + const uint32_t ngl = ngl_per_device[id].part + ngl_per_device[id].full; + tensor_split[id] = ngl; + il0 += ngl; + + global_ngl_part += ngl_per_device[id].part; + global_ngl_full += ngl_per_device[id].full; + } + tensor_buft_overides[itbo].pattern = nullptr; + tensor_buft_overides[itbo].buft = nullptr; + itbo++; + mparams->tensor_buft_overrides = tensor_buft_overides; + } + + const llama_memory_breakdown_data & mb_last = dmds_last.back().mb; + const int64_t projected_use_last = int64_t(mb_last.model + mb_last.context + mb_last.compute) + - int64_t(hp_ngl + 1 - ngl_per_device.back().full) * spl_full.back().per_layer + + int64_t(ngl_per_device.back().part) * spl_part.back().per_layer; + const int64_t projected_margin_last = dmds_last.back().free - projected_use_last; + + if (nd == 1) { + LLAMA_LOG_INFO("%s: set to use %u dense-only layers and %u full layers, %" PRId64 " MiB used, %" PRId64 " MiB free\n", + __func__, 
ngl_per_device.back().part, ngl_per_device.back().full, projected_use_last/MiB, projected_margin_last/MiB); + return sufficient_tbo; + } + LLAMA_LOG_INFO("%s: set to use %u dense-only and %u full GPU layers in total, projected memory use:\n", + __func__, global_ngl_part, global_ngl_full); + for (size_t id = 0; id < nd - 1; id++) { + const int64_t projected_use = spl_full[id].base + + int64_t(ngl_per_device[id].part)*spl_part[id].per_layer + int64_t(ngl_per_device[id].full)*spl_full[id].per_layer; + const int64_t projected_margin = dmds_last[id].free - projected_use; + LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " dense-only layers, %2" PRIu32 " full layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + __func__, dev_names[id].c_str(), ngl_per_device[id].part, ngl_per_device[id].full, projected_use/MiB, projected_margin/MiB); + } + LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " dense-only layers, %2" PRIu32 " full layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + __func__, dev_names.back().c_str(), ngl_per_device.back().part, ngl_per_device.back().full, projected_use_last/MiB, projected_margin_last/MiB); + return sufficient_tbo; + } + + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", __func__, -global_surplus/MiB); + } + + // step 4: if the model only has dense tensors or there is insufficient memory to fit all dense tensors, + // all layers are the same so simply determine how many layers will fit per device + + const uint32_t nl_scaling = hp_ngl / nd; + std::vector ms; + { + LLAMA_LOG_DEBUG("%s: getting device memory data for 1 full layer:\n", __func__); + auto tmp1 = get_memory_for_const_layer(1); + LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " full layers:\n", __func__, nl_scaling); + auto tmpn = get_memory_for_const_layer(nl_scaling); + ms = get_memory_scaling(tmp1, tmpn, nl_scaling); + } + + mparams->n_gpu_layers = 0; + std::vector ngl_per_device; + ngl_per_device.reserve(nd); + for (size_t id = 0; id < nd; id++) { + const uint32_t ngl = (dmds_full[id].free - margin - ms[id].base) / ms[id].per_layer; + mparams->n_gpu_layers += ngl; + ngl_per_device.push_back(ngl); + } + if (nd == 1) { + const int64_t projected_use = ms[0].base + int64_t(ngl_per_device[0])*ms[0].per_layer; + const int64_t projected_margin = dmds_full[0].free - projected_use; + LLAMA_LOG_INFO("%s: set n_gpu_layers to %" PRIu32 ", projected to use %" PRId64 " MiB with %" PRId64 " MiB free\n", + __func__, mparams->n_gpu_layers, projected_use/MiB, projected_margin/MiB); + return true; + } + LLAMA_LOG_INFO("%s: set n_gpu_layers to %" PRIu32 ", projected memory use:\n", __func__, mparams->n_gpu_layers); + for (size_t id = 0; id < nd; id++) { + const int64_t projected_use = ms[id].base + int64_t(ngl_per_device[id])*ms[id].per_layer; + const int64_t projected_margin = dmds_full[id].free - projected_use; + LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + __func__, dev_names[id].c_str(), ngl_per_device[id], projected_use/MiB, projected_margin/MiB); + } + return true; +} + struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params result = { /*.no_perf =*/ true, @@ -49,6 +589,10 @@ size_t llama_max_devices(void) { return 16; } +size_t llama_max_tensor_buft_overrides() { + return 4096; +} + bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } @@ -108,11 +652,12 @@ static int llama_model_load(const std::string & fname, std::vector 
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
         model.hparams.vocab_only = params.vocab_only;
+        model.hparams.no_alloc = params.no_alloc;
 
         try {
             model.load_arch(ml);
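
Usage sketch for the new llama_params_fit() API (illustrative only, not part of the patch): a standalone caller would size the writable tensor_split and tensor_buft_overrides buffers via llama_max_devices() and llama_max_tensor_buft_overrides(), let llama_params_fit() adjust mparams/cparams, and then load the model with the adjusted parameters, mirroring common_init_from_params() above. The model path is a placeholder, the margin and minimum context values simply mirror the common_params defaults, and wiring the buffers into mparams follows what common_model_params_to_llama() does for the corresponding common_params fields.

```cpp
#include "llama.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const char * path_model = "model.gguf"; // placeholder path

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // writable buffers that llama_params_fit() can fill in
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> buft_overrides(llama_max_tensor_buft_overrides());

    // wire the buffers into mparams as well, mirroring common_model_params_to_llama()
    mparams.tensor_split          = tensor_split.data();
    mparams.tensor_buft_overrides = buft_overrides.data();

    const size_t   margin    = 1024u*1024u*1024u; // leave ~1 GiB free per device (common_params default)
    const uint32_t n_ctx_min = 4096;              // do not shrink the context below this (common_params default)

    if (!llama_params_fit(path_model, &mparams, &cparams, tensor_split.data(), buft_overrides.data(),
                          margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        fprintf(stderr, "failed to fit parameters to device memory, allocation may fail\n");
    }

    // load with the (possibly adjusted) parameters, as common_init_from_params() does
    llama_model * model = llama_model_load_from_file(path_model, mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == nullptr) {
        llama_model_free(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```

From the command line the same behaviour is exposed through the new -fit/--fit, -fitm/--fit-margin and -fitc/--fit-ctx options added in common/arg.cpp, and through the LLAMA_ARG_FIT, LLAMA_ARG_FIT_MARGIN and LLAMA_ARG_FIT_CTX environment variables.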
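The no_alloc paths above rely on the new size-only helpers: ggml_backend_alloc_ctx_tensors_from_buft_size() reports how large the buffer created by ggml_backend_alloc_ctx_tensors_from_buft() would be, and ggml_backend_sched_reserve_size() does the same for compute buffers. A minimal sketch of the former against the CPU buffer type (the tensor shapes are arbitrary; assumes ggml_backend_cpu_buffer_type() is declared in ggml-backend.h):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <cstdio>

int main() {
    // a context with no_alloc == true, as required by the size helper
    ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    ggml_context * ctx = ggml_init(ip);

    // a couple of placeholder tensors
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // size that ggml_backend_alloc_ctx_tensors_from_buft() would allocate, without allocating anything
    const size_t nbytes = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    printf("would allocate %.2f MiB\n", nbytes / 1024.0 / 1024.0);

    // the real allocation can still be performed afterwards
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == nullptr) {
        fprintf(stderr, "allocation failed\n");
    }
    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}
```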
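llama_get_device_memory_data() uses the new llama_log_get()/ggml_log_get() to save the active logger, demote the probe runs' output to the debug level, and restore the logger afterwards. The same save/restore pattern from application code, as a minimal sketch (the no-op callback is only an example):

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    ggml_log_callback prev_cb = nullptr;
    void *            prev_ud = nullptr;
    llama_log_get(&prev_cb, &prev_ud); // save the currently installed logger

    // temporarily drop all log output, e.g. around noisy probing calls
    llama_log_set([](ggml_log_level, const char *, void *) { /* discard */ }, nullptr);

    // ... calls whose log output should be suppressed ...

    llama_log_set(prev_cb, prev_ud);   // restore the previous logger
    printf("logger restored\n");
    return 0;
}
```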