
Commit c1dbea7 (1 parent: 749e0d2)

context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (ggml-org#14870)

ggml-ci

2 files changed: +17 -1 lines

src/llama-context.cpp

Lines changed: 13 additions & 1 deletion
@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
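For context, the parsing rule above makes the feature opt-in: with LLAMA_SET_ROWS unset or set to 0, supports_set_rows stays false. A minimal standalone sketch of the same pattern (illustrative only, not part of the commit):

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // Same rule as in llama_context's constructor:
        // unset or "0" -> false; any non-zero integer -> true.
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
        printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
        return 0;
    }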
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
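Note that supports_set_rows is promoted from a constructor local to a llama_context member (see llama-context.h below) precisely so that encode() and decode() can consult it here. The intent of the restored reset, per the in-code comment, is to do the cheap CPU-side scheduler reset while the device is still computing, so its cost is hidden behind the backend sync. A toy illustration of that overlap in standard C++ (device_compute and sched_reset are hypothetical stand-ins, not the ggml API):

    #include <chrono>
    #include <cstdio>
    #include <future>
    #include <thread>

    // Stand-ins for illustration only: "device" work and the CPU-side reset.
    static void device_compute() { std::this_thread::sleep_for(std::chrono::milliseconds(50)); }
    static void sched_reset()    { std::this_thread::sleep_for(std::chrono::milliseconds(10)); }

    int main() {
        const auto t0 = std::chrono::steady_clock::now();
        auto device = std::async(std::launch::async, device_compute); // device runs in background
        sched_reset();                                                // reset overlaps with device work
        device.wait();                                                // the "backend sync"
        const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                            std::chrono::steady_clock::now() - t0).count();
        printf("total: %lld ms (~50, not ~60: the reset was hidden)\n", (long long) ms);
        return 0;
    }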

src/llama-context.h

Lines changed: 4 additions & 0 deletions
@@ -287,6 +287,10 @@ struct llama_context {
 
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
