diff --git a/common/common.cpp b/common/common.cpp
index 1e761b6da..b6d859b33 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2653,7 +2653,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
-
+
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
         model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2b92e75d..537334bf0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1237,6 +1237,10 @@ std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, in
     return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
 }
 
+void llama_model::set_tensor_overrides(const llama_model_params& params) {
+    tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
+}
+
 std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
diff --git a/src/llama-model.h b/src/llama-model.h
index a26c7cb3d..a8ea67b91 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -305,10 +305,18 @@ struct llama_model {
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
 
+    bool tensor_overrides;
+
     ~llama_model();
 
     // Not actually needed, but left in place for now
     size_t max_nodes() const { return 65536; }
+
+    bool has_tensor_overrides() const {
+        return tensor_overrides;
+    };
+
+    void set_tensor_overrides(const llama_model_params& params);
 };
 
 struct llama_lora_weight {
diff --git a/src/llama.cpp b/src/llama.cpp
index b740b7c18..e07748ffa 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3968,7 +3968,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    model->set_tensor_overrides(params);
     // model->devices hold device indices that are used to offload
     // use model->devices to determine offload device
     // if no device is specified, all device are included
@@ -4478,7 +4478,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_get_device_count(*model) > 1 &&
                 model->n_gpu_layers > (int)model->hparams.n_layer &&
                 model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
-                params.offload_kqv;
+                params.offload_kqv && !model->has_tensor_overrides();
 #ifndef GGML_USE_CUDA
             // pipeline parallelism requires support for async compute and events
             // currently this is only implemented in the CUDA backend
@@ -4497,10 +4497,19 @@ struct llama_context * llama_new_context_with_model(
             ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
 
             // initialize scheduler with the worst-case graph
-            if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
-                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-                llama_free(ctx);
-                return nullptr;
+            bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+            if (!gf_success)
+            {
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
+                    gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+                }
+                if (!gf_success) {
+                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
             }
 
             for (size_t i = 0; i < ctx->backends.size(); i++) {