
Commit 5a4ff43

llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
1 parent 10640e3 commit 5a4ff43

File tree

1 file changed: +8 additions, -3 deletions

src/llama-context.cpp

Lines changed: 8 additions & 3 deletions
@@ -268,9 +268,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
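
For context, here is a minimal self-contained sketch of the fallback logic this commit introduces: attempt the graph reservation, and if it fails while pipeline parallelism is enabled, rebuild the scheduler without pipeline parallelism and retry once before giving up. The scheduler, make_scheduler, and reserve_graph names below are hypothetical stand-ins for illustration, not the actual llama.cpp/ggml API.

#include <cstdio>
#include <memory>
#include <stdexcept>

// Hypothetical stand-ins for the real scheduler and graph-reservation step.
// Reservation is made to fail whenever pipeline parallelism is on, purely to
// exercise the fallback path from the patch above.
struct scheduler {
    bool pipeline_parallel;
};

static std::unique_ptr<scheduler> make_scheduler(bool pipeline_parallel) {
    return std::make_unique<scheduler>(scheduler{pipeline_parallel});
}

static bool reserve_graph(const scheduler & sched) {
    return !sched.pipeline_parallel; // pretend allocation fails with n_copies > 1
}

int main() {
    bool pipeline_parallel = true;
    auto sched = make_scheduler(pipeline_parallel);

    bool ok = reserve_graph(*sched);
    if (!ok && pipeline_parallel) {
        // Same idea as the patch: warn, rebuild the scheduler without
        // pipeline parallelism, and retry the reservation once.
        std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
        sched = make_scheduler(false);
        ok = reserve_graph(*sched);
    }
    if (!ok) {
        // Only give up if the single-copy retry also failed.
        throw std::runtime_error("failed to allocate compute pp buffers");
    }

    std::printf("graph reserved (pipeline parallelism: %s)\n",
                sched->pipeline_parallel ? "on" : "off");
    return 0;
}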
