@@ -194,6 +194,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load the prompts from an external file if there are any
@@ -259,7 +261,7 @@ int main(int argc, char ** argv) {
 
     // assign the system KV cache to all parallel sequences
     for (int32_t i = 1; i <= n_clients; ++i) {
-        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+        llama_memory_seq_cp(mem, 0, i, -1, -1);
     }
 
     LOG_INF("\n");
@@ -286,9 +288,9 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
             for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_self_seq_rm(ctx, i, -1, -1);
+                llama_memory_seq_rm(mem, i, -1, -1);
                 // but keep the system prompt
-                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+                llama_memory_seq_cp(mem, 0, i, -1, -1);
             }
 
             LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -447,8 +449,8 @@ int main(int argc, char ** argv) {
                 }
 
                 // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
-                llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
+                llama_memory_seq_rm(mem, client.id + 1, -1, -1);
+                llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1);
 
                 const auto t_main_end = ggml_time_us();
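
The pattern shown in this diff — fetching the memory handle once with llama_get_memory() and then operating on sequences through it — replaces the older llama_kv_self_seq_* calls that took the context directly. A minimal sketch of the same reset logic as a standalone helper (the function name reset_clients and the surrounding setup are illustrative, not part of this commit; only the llama_get_memory / llama_memory_seq_rm / llama_memory_seq_cp calls come from the change above):

    #include "llama.h"

    // Re-point every client sequence at the shared system prompt held in
    // sequence 0, discarding whatever the clients have generated so far.
    // Assumes `ctx` is an already-initialized llama_context.
    static void reset_clients(llama_context * ctx, int32_t n_clients) {
        // sequence operations now go through the memory handle, not the context
        auto * mem = llama_get_memory(ctx);

        for (int32_t i = 1; i <= n_clients; ++i) {
            // remove all tokens belonging to sequence i ...
            llama_memory_seq_rm(mem, i, -1, -1);
            // ... then copy the system prompt from sequence 0 into it
            llama_memory_seq_cp(mem, 0, i, -1, -1);
        }
    }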