2 files changed: 17 additions and 1 deletion.
Original file line number Diff line number Diff line change @@ -105,7 +105,7 @@ llama_context::llama_context(
105
105
106
106
{
107
107
const char * LLAMA_SET_ROWS = getenv (" LLAMA_SET_ROWS" );
108
- const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi (LLAMA_SET_ROWS) != 0 ) : false ;
108
+ supports_set_rows = LLAMA_SET_ROWS ? (atoi (LLAMA_SET_ROWS) != 0 ) : false ;
109
109
110
110
if (!supports_set_rows && !cparams.kv_unified ) {
111
111
LLAMA_LOG_WARN (" %s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n " , __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
899
899
}
900
900
}
901
901
902
+ if (!supports_set_rows) {
903
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
904
+ // overlap with device computation.
905
+ ggml_backend_sched_reset (sched.get ());
906
+ }
907
+
902
908
// TODO: hacky solution
903
909
if (model.arch == LLM_ARCH_T5 && t_embd) {
904
910
// cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
1229
1235
// wait for the computation to finish (automatically done when obtaining the model output)
1230
1236
// synchronize();
1231
1237
1238
+ if (!supports_set_rows) {
1239
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
1240
+ // overlap with device computation.
1241
+ ggml_backend_sched_reset (sched.get ());
1242
+ }
1243
+
1232
1244
return 0 ;
1233
1245
}
1234
1246
Original file line number Diff line number Diff line change @@ -287,6 +287,10 @@ struct llama_context {
287
287
288
288
bool has_evaluated_once = false ;
289
289
290
+ // env: LLAMA_SET_ROWS (temporary)
291
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14285
292
+ bool supports_set_rows = false ;
293
+
290
294
// perf
291
295
mutable int64_t t_start_us = 0 ;
292
296
mutable int64_t t_load_us = 0 ;
You can’t perform that action at this time.
0 commit comments