1 file changed: +13 −1 lines changed

@@ -105,7 +105,7 @@ llama_context::llama_context(
105105
106106 {
107107 const char * LLAMA_SET_ROWS = getenv (" LLAMA_SET_ROWS" );
108- const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi (LLAMA_SET_ROWS) != 0 ) : false ;
108+ supports_set_rows = LLAMA_SET_ROWS ? (atoi (LLAMA_SET_ROWS) != 0 ) : false ;
109109
110110 if (!supports_set_rows && !cparams.kv_unified ) {
111111 LLAMA_LOG_WARN (" %s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n " , __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
899899 }
900900 }
901901
902+ if (!supports_set_rows) {
903+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
904+ // overlap with device computation.
905+ ggml_backend_sched_reset (sched.get ());
906+ }
907+
902908 // TODO: hacky solution
903909 if (model.arch == LLM_ARCH_T5 && t_embd) {
904910 // cross.t_embd = t_embd;
@@ -1237,6 +1243,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
12371243 // wait for the computation to finish (automatically done when obtaining the model output)
12381244 // synchronize();
12391245
1246+ if (!supports_set_rows) {
1247+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
1248+ // overlap with device computation.
1249+ ggml_backend_sched_reset (sched.get ());
1250+ }
1251+
12401252 return 0 ;
12411253}
12421254
0 commit comments