@@ -2815,42 +2815,6 @@ struct llama_kv_cache {
     }
 };
 
-class llama_kv_cache_state {
-    struct llama_kv_cache_state_short {
-        uint32_t head = 0;
-        uint32_t size = 0;
-        uint32_t used = 0;
-        uint32_t n = 0;
-
-        std::vector<llama_kv_cell> cells;
-    } old_state;
-
-    bool saved = false;
-
-public:
-    void save_state(const llama_kv_cache& cache) {
-        old_state.head = cache.head;
-        old_state.size = cache.size;
-        old_state.used = cache.used;
-        old_state.n = cache.n;
-        old_state.cells = cache.cells;
-
-        saved = true;
-    }
-
-    void restore(llama_kv_cache& cache) {
-        if (saved) {
-            cache.head = old_state.head;
-            cache.size = old_state.size;
-            cache.used = old_state.used;
-            cache.n = old_state.n;
-            cache.cells = std::move(old_state.cells);
-
-            saved = false;
-        }
-    }
-};
-
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<struct ggml_context *> ctxs;
@@ -17220,7 +17184,6 @@ static int llama_decode_internal(
     lctx.n_queued_tokens += n_tokens_all;
 
     auto & kv_self = lctx.kv_self;
-    llama_kv_cache_state kv_cache_state_holder;
 
     const int64_t n_embd = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
@@ -17298,7 +17261,6 @@ static int llama_decode_internal(
         // non-causal masks do not use the KV cache
         if (hparams.causal_attn) {
             llama_kv_cache_update(&lctx);
-            kv_cache_state_holder.save_state(kv_self);
 
             // if we have enough unused cells before the current head ->
             //   better to start searching from the beginning of the cache, hoping to fill it
@@ -17356,17 +17318,16 @@ static int llama_decode_internal(
         llama_set_inputs(lctx, ubatch);
 
         const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
-        if (compute_status != GGML_STATUS_SUCCESS) {
-            kv_cache_state_holder.restore(kv_self);
-            switch (compute_status) {
-                case GGML_STATUS_ABORTED:
-                    return 2;
-                case GGML_STATUS_ALLOC_FAILED:
-                    return -2;
-                case GGML_STATUS_FAILED:
-                default:
-                    return -3;
-            }
+        switch (compute_status) {
+            case GGML_STATUS_SUCCESS:
+                break;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
         }
 
         // update the kv ring buffer
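
For context, a minimal caller-side sketch (not part of this commit) of how the return codes produced by the new switch might be handled. It assumes the public llama_decode() forwards the result of llama_decode_internal(); decode_or_report() and its messages are hypothetical.

// Map the decode return codes seen in the diff above to a simple pass/fail,
// logging a short description of each failure.
#include "llama.h"
#include <cstdio>

static bool decode_or_report(llama_context * ctx, llama_batch batch) {
    const int ret = llama_decode(ctx, batch);
    switch (ret) {
        case 0:   // GGML_STATUS_SUCCESS
            return true;
        case 2:   // GGML_STATUS_ABORTED, e.g. an abort callback fired
            fprintf(stderr, "decode aborted\n");
            return false;
        case -2:  // GGML_STATUS_ALLOC_FAILED
            fprintf(stderr, "compute buffer allocation failed\n");
            return false;
        case -3:  // GGML_STATUS_FAILED and any other graph-compute failure
            fprintf(stderr, "graph compute failed\n");
            return false;
        default:  // other codes come from earlier checks in llama_decode_internal
            fprintf(stderr, "llama_decode returned %d\n", ret);
            return false;
    }
}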