Merged
common/common.cpp (1 addition, 1 deletion)
@@ -2653,7 +2653,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);

llama_model * model = nullptr;

if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
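Aside, not part of the diff: a minimal sketch of the Hugging Face download path this hunk goes through, using the gpt_params fields visible above (hf_repo, hf_file, hf_token, model). The repo and file names are made up, and the llama_init_result member names (model, context) are assumed rather than shown in the hunk.

#include "common.h"

int main() {
    llama_backend_init();

    gpt_params params;
    params.hf_repo  = "someorg/somemodel-GGUF";  // hypothetical repository
    params.hf_file  = "somemodel-Q4_K_M.gguf";   // hypothetical file inside the repo
    params.hf_token = "";                        // optional token for gated repos
    params.model    = "models/somemodel.gguf";   // local path the download is cached to

    llama_init_result init = llama_init_from_gpt_params(params);
    if (init.model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_free(init.context);
    llama_free_model(init.model);
    llama_backend_free();
    return 0;
}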
src/llama-model.cpp (4 additions)
@@ -1237,6 +1237,10 @@ std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, in
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
}

void llama_model::set_tensor_overrides(const llama_model_params& params) {
tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

std::string llama_model_ftype_name(llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
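Aside, not from the PR: a sketch of what the new helper records, written against the library-internal llama_model type from src/llama-model.h. The override struct name llama_model_tensor_buft_override and its {pattern, buft} layout are assumptions borrowed from upstream llama.cpp; only the tensor_buft_overrides field and its .pattern member are visible in the hunk above.

#include "llama.h"
#include "llama-model.h"   // internal header, only reachable from inside the library
#include "ggml-backend.h"  // ggml_backend_cpu_buffer_type(); header layout may vary by version

static void show_override_flag(llama_model & model) {
    llama_model_params p = llama_model_default_params();

    model.set_tensor_overrides(p);
    // has_tensor_overrides() == false: no override list was supplied

    const llama_model_tensor_buft_override terminator_only[] = {
        { nullptr, nullptr },
    };
    p.tensor_buft_overrides = terminator_only;
    model.set_tensor_overrides(p);
    // has_tensor_overrides() == false: the first entry is already the nullptr terminator

    const llama_model_tensor_buft_override keep_ffn_on_cpu[] = {
        { "blk\\..*\\.ffn_.*", ggml_backend_cpu_buffer_type() },
        { nullptr,             nullptr                        },
    };
    p.tensor_buft_overrides = keep_ffn_on_cpu;
    model.set_tensor_overrides(p);
    // has_tensor_overrides() == true: non-null list whose first pattern is real
}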
src/llama-model.h (8 additions)
@@ -305,10 +305,18 @@ struct llama_model {
// keep track of loaded lora adapters
std::set<llama_lora_adapter *> lora_adapters;

bool tensor_overrides;

~llama_model();

// Not actually needed, but left in place for now
size_t max_nodes() const { return 65536; }

bool has_tensor_overrides() const {
return tensor_overrides;
};

void set_tensor_overrides(const llama_model_params& params);
};

struct llama_lora_weight {
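Aside, not part of the diff: the flag declared above is driven entirely by what the caller puts into llama_model_params before loading. A hedged sketch of that caller side, assuming the upstream llama.cpp override struct (a name pattern paired with a ggml buffer type, with a nullptr-pattern terminator):

#include "llama.h"
#include "ggml-backend.h"  // ggml_backend_cpu_buffer_type(); header name may vary by version

// Load a model while pinning matching tensors to host (CPU) buffers. With this PR,
// such a model reports has_tensor_overrides() == true and therefore opts out of
// pipeline parallelism (see the src/llama.cpp hunks below).
llama_model * load_with_overrides(const char * path_model) {
    // hypothetical pattern: keep expert FFN weights in system RAM
    static const llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() },
        { nullptr,       nullptr                        },  // terminator checked by set_tensor_overrides()
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers          = 99;   // offload everything else as usual
    mparams.tensor_buft_overrides = overrides;

    return llama_load_model_from_file(path_model, mparams);
}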
src/llama.cpp (15 additions, 6 deletions)
@@ -3968,7 +3968,7 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}

model->set_tensor_overrides(params);
// model->devices hold device indices that are used to offload
// use model->devices to determine offload device
// if no device is specified, all device are included
@@ -4478,7 +4478,7 @@ struct llama_context * llama_new_context_with_model(
llama_get_device_count(*model) > 1 &&
model->n_gpu_layers > (int)model->hparams.n_layer &&
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;
params.offload_kqv && !model->has_tensor_overrides();
#ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend
@@ -4497,10 +4497,19 @@ struct llama_context * llama_new_context_with_model(
ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

// initialize scheduler with the worst-case graph
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
llama_free(ctx);
return nullptr;
bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
if (!gf_success)
{
if (pipeline_parallel) {
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
}
if (!gf_success) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
llama_free(ctx);
return nullptr;
}
}

for (size_t i = 0; i < ctx->backends.size(); i++) {
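Aside, not part of the diff: from the caller's side the two changes above only show up as a log line and the absence of pipeline parallelism; nothing in the public API changes. A short sketch of that caller side, using only llama.h functions that appear in this diff or are standard in this codebase; the parameter values are arbitrary examples.

#include "llama.h"

llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx       = 8192;   // arbitrary example values
    cparams.n_batch     = 2048;
    cparams.offload_kqv = true;   // after this PR, offload_kqv alone no longer enables
                                  // pipeline parallelism if the model has tensor overrides

    // If the worst-case graph reservation fails while pipeline parallelism is enabled,
    // llama_new_context_with_model now rebuilds the scheduler without it and retries;
    // nullptr is returned only when that retry also fails to allocate compute buffers.
    return llama_new_context_with_model(model, cparams);
}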