
Commit 84742ef

server : disallow use cases involving partial SWA context
ggml-ci
1 parent: ca52e19

File tree

2 files changed: +9 additions, −5 deletions


src/llama-kv-cache.cpp
Lines changed: 1 addition & 4 deletions

@@ -1807,10 +1807,7 @@ llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
 }
 
 bool llama_kv_cache_unified_iswa::get_can_shift() const {
-    // TODO: for now allow this, eventhough it's not mathematically correct
-    // but some initial tests indicate that the results are not bad
-    return true;
-    //return kv_base->get_size() == kv_swa->get_size();
+    return kv_base->get_size() == kv_swa->get_size();
 }
 
 void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
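
For context: an iSWA model keeps two unified KV caches, a full-context "base" cache and a sliding-window (SWA) cache. The removed TODO allowed a K-shift even when the SWA cache was smaller than the base cache, which the old comment itself flagged as not mathematically correct. Below is a minimal, self-contained sketch of the restored rule; the struct and sizes are illustrative, not the upstream llama_kv_cache_unified_iswa class.

#include <cstdint>

// Illustrative sketch only, not the upstream implementation.
struct kv_iswa_sketch {
    uint32_t size_base; // cells in the full-context (base) cache
    uint32_t size_swa;  // cells in the sliding-window cache

    // a K-shift rotates positions in both caches at once; if the SWA cache
    // is smaller, positions already pruned from it cannot be shifted
    // consistently, so shifting is only sound when the sizes match
    bool get_can_shift() const {
        return size_base == size_swa;
    }
};

int main() {
    const kv_iswa_sketch kv = { 8192, 4096 }; // hypothetical cache sizes
    return kv.get_can_shift() ? 1 : 0;        // partial SWA context: disallowed
}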

tools/server/server.cpp
Lines changed: 8 additions & 1 deletion

@@ -3198,7 +3198,14 @@ struct server_context {
                 // if we don't cache the prompt, we have to remove the entire KV cache
                 llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                 slot.n_past = 0;
-                slot.cache_tokens.clear();
+                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+            }
+
+            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                    SLT_WRN(slot, "%s", "forcing full prompt re-processing due to lack of cache data\n");
+                    slot.n_past = 0;
+                }
             }
         }

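Why the server-side guard is needed: prompt caching reuses the first n_past tokens of the previous request, but an SWA cache prunes positions that have slid out of the window, so llama_kv_self_seq_pos_min() can return a value greater than zero. In that case the cached prefix is incomplete and partial reuse would compute attention over missing KV data, hence the forced full re-processing. Below is a self-contained sketch of the failure condition; the struct, window size, and token counts are hypothetical, not the upstream server code.

#include <algorithm>
#include <cstdint>

// Illustrative sketch only, not the upstream implementation.
struct swa_cache_sketch {
    int64_t n_swa;   // hypothetical sliding-window size
    int64_t pos_max; // highest position stored for the sequence

    // analogous to llama_kv_self_seq_pos_min(): positions that slid out of
    // the window have been pruned, so the minimum cached position advances
    int64_t pos_min() const {
        return std::max<int64_t>(0, pos_max + 1 - n_swa);
    }
};

// mirrors the new server guard: reusing a strict prefix of the cached
// tokens is only valid when every position from 0 is still present
bool must_reprocess(const swa_cache_sketch & kv, int64_t n_past, int64_t n_cached) {
    return n_past > 0 && n_past < n_cached && kv.pos_min() > 0;
}

int main() {
    // e.g. window 4096, previous prompt of 6000 tokens: pos_min() == 1904,
    // so a request sharing a 5000-token prefix must re-process from scratch
    const swa_cache_sketch kv = { 4096, 5999 };
    return must_reprocess(kv, 5000, 6000) ? 0 : 1;
}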