Commit 663027f

context : fix n_outputs during reserve (ggml-org#15858)
ggml-ci
1 parent cf0e3ba commit 663027f

2 files changed, +4 -3 lines changed

src/llama-context.cpp (2 additions & 2 deletions)

@@ -285,8 +285,8 @@ llama_context::llama_context(
     const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

-    // avoid reserving graphs with zero outputs
-    n_outputs = 1;
+    // avoid reserving graphs with zero outputs - assume one output per sequence
+    n_outputs = n_seqs;

     LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);

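For context, the hunk above sizes the worst-case graph reservation performed in the llama_context constructor. Below is a minimal standalone sketch of that sizing logic; cparams_t, reserve_sizes, and worst_case_reserve are illustrative names for this note only, not llama.cpp's API. With a unified KV cache n_seqs stays 1, so the behavior is unchanged; otherwise the reserve now assumes one output per sequence instead of a single output overall.

// Standalone sketch (illustrative names, not llama.cpp's API) of the
// worst-case reserve sizing shown in the hunk above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct cparams_t {
    uint32_t n_ctx;
    uint32_t n_ubatch;
    uint32_t n_seq_max;
    bool     kv_unified;
};

struct reserve_sizes {
    uint32_t n_tokens;
    uint32_t n_seqs;
    uint32_t n_outputs;
};

static reserve_sizes worst_case_reserve(const cparams_t & cparams) {
    // one KV stream when the cache is unified, otherwise one per sequence
    const uint32_t n_seqs   = cparams.kv_unified ? 1 : cparams.n_seq_max;
    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

    // before this commit: n_outputs = 1 (never zero, but a single output regardless of n_seqs)
    // after:              one output per sequence, still never zero
    const uint32_t n_outputs = n_seqs;

    return { n_tokens, n_seqs, n_outputs };
}

int main() {
    const cparams_t cparams = { /*n_ctx =*/ 4096, /*n_ubatch =*/ 512, /*n_seq_max =*/ 4, /*kv_unified =*/ false };
    const reserve_sizes s   = worst_case_reserve(cparams);

    std::printf("worst-case: n_tokens = %u, n_seqs = %u, n_outputs = %u\n", s.n_tokens, s.n_seqs, s.n_outputs);
    return 0;
}
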
src/llama-graph.cpp (2 additions & 1 deletion)

@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(

     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-    assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
+    // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+    //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;