Commit 2ac4b42

Author: firecoperana

disable pipeline parallel if allocation failed

1 parent 70ac64e commit 2ac4b42

File tree

1 file changed: +13 -4 lines changed

src/llama.cpp

Lines changed: 13 additions & 4 deletions
@@ -4497,10 +4497,19 @@ struct llama_context * llama_new_context_with_model(
         ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

         // initialize scheduler with the worst-case graph
-        if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-            llama_free(ctx);
-            return nullptr;
+        bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+        if (!gf_success)
+        {
+            if (pipeline_parallel) {
+                LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
+                gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+            }
+            if (!gf_success) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
         }

         for (size_t i = 0; i < ctx->backends.size(); i++) {
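
The hunk follows a try-then-retry-degraded pattern: reserve compute buffers for the worst-case graph with pipeline parallelism enabled, and if that fails, rebuild the scheduler without pipeline parallelism and try once more before giving up. Below is a minimal, self-contained C++ sketch of that control flow; sched_new, sched_reserve, and sched_free are hypothetical stand-ins for ggml_backend_sched_new/ggml_backend_sched_reserve, not the real ggml API.

#include <cstdio>

// Hypothetical scheduler type standing in for ggml_backend_sched_t.
struct sched_t { bool pipeline_parallel; };

static sched_t * sched_new(bool pipeline_parallel) {
    return new sched_t{pipeline_parallel};
}

static void sched_free(sched_t * sched) { delete sched; }

// For the sketch, pretend reservation only succeeds without pipeline
// parallelism, e.g. because the extra per-backend buffers would not fit.
static bool sched_reserve(sched_t * sched) {
    return !sched->pipeline_parallel;
}

int main() {
    bool pipeline_parallel = true;

    // First attempt: reserve with pipeline parallelism enabled.
    sched_t * sched = sched_new(pipeline_parallel);
    bool ok = sched_reserve(sched);

    if (!ok && pipeline_parallel) {
        // Same shape as the commit: warn, rebuild the scheduler with
        // pipeline parallelism disabled, and retry the reservation once.
        fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
        sched_free(sched);
        sched = sched_new(/*pipeline_parallel=*/false);
        ok = sched_reserve(sched);
    }

    if (!ok) {
        // Both attempts failed: clean up and report the error, as the
        // original code does with llama_free() and returning nullptr.
        fprintf(stderr, "failed to allocate compute buffers\n");
        sched_free(sched);
        return 1;
    }

    printf("scheduler ready (pipeline parallelism: %s)\n",
           sched->pipeline_parallel ? "on" : "off");
    sched_free(sched);
    return 0;
}

One design note: the sketch frees the old scheduler before rebuilding it, whereas the diff reassigns ctx->sched directly; whether the previous scheduler is released elsewhere is not visible in this hunk.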
