@@ -431,25 +431,26 @@ bool llama_context::kv_self_update() {
 
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    if (kv_self->update(*this)) {
-        // if the KV cache did any computation, we have to reserve a new worst-case graph
-        const auto kv_state = kv_self->init_full();
-        if (!kv_state) {
-            throw std::runtime_error("failed to initialize KV cache");
-        }
+    if (!kv_self->update(*this)) {
+        // no updates have been performed
+        return false;
+    }
 
-        const uint32_t n_seqs = cparams.n_seq_max;
-        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+    // if the KV cache did any computation, we have to reserve a new worst-case graph
+    const auto kv_state = kv_self->init_full();
+    if (!kv_state) {
+        throw std::runtime_error("failed to initialize KV cache");
+    }
 
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
-        if (!gf) {
-            LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
-        }
+    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
-        return true;
+    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
     }
 
-    return false;
+    return true;
 }
 
 enum llama_pooling_type llama_context::pooling_type() const {
0 commit comments