Skip to content

Commit 33834e7

Browse files
Update llama-context-mmojo.cpp
Signed-off-by: Brad Hutchings <[email protected]>
1 parent 7f10b6e commit 33834e7

File tree

1 file changed

+13
-1
lines changed

src/llama-context-mmojo.cpp

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ llama_context::llama_context(
105105

106106
{
107107
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
108-
const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
108+
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
109109

110110
if (!supports_set_rows && !cparams.kv_unified) {
111111
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
899899
}
900900
}
901901

902+
if (!supports_set_rows) {
903+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
904+
// overlap with device computation.
905+
ggml_backend_sched_reset(sched.get());
906+
}
907+
902908
// TODO: hacky solution
903909
if (model.arch == LLM_ARCH_T5 && t_embd) {
904910
//cross.t_embd = t_embd;
@@ -1237,6 +1243,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
12371243
// wait for the computation to finish (automatically done when obtaining the model output)
12381244
//synchronize();
12391245

1246+
if (!supports_set_rows) {
1247+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
1248+
// overlap with device computation.
1249+
ggml_backend_sched_reset(sched.get());
1250+
}
1251+
12401252
return 0;
12411253
}
12421254

0 commit comments

Comments
 (0)