
Commit 84742ef

server : disallow use cases involving partial SWA context
ggml-ci
1 parent: ca52e19

File tree

2 files changed: +9 additions, −5 deletions


src/llama-kv-cache.cpp
Lines changed: 1 addition & 4 deletions

@@ -1807,10 +1807,7 @@ llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
 }
 
 bool llama_kv_cache_unified_iswa::get_can_shift() const {
-    // TODO: for now allow this, eventhough it's not mathematically correct
-    // but some initial tests indicate that the results are not bad
-    return true;
-    //return kv_base->get_size() == kv_swa->get_size();
+    return kv_base->get_size() == kv_swa->get_size();
 }
 
 void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
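
For context: an iSWA model keeps two unified KV caches, a full-context "base" cache and a sliding-window (SWA) cache. The removed TODO allowed a K-shift even when the SWA cache was smaller than the base cache, which the old comment itself flagged as not mathematically correct. Below is a minimal, self-contained sketch of the restored rule; the struct and sizes are illustrative, not the upstream llama_kv_cache_unified_iswa class.

#include <cstdint>

// Illustrative sketch only, not the upstream implementation.
struct kv_iswa_sketch {
    uint32_t size_base; // cells in the full-context (base) cache
    uint32_t size_swa;  // cells in the sliding-window cache

    // a K-shift rotates positions in both caches at once; if the SWA cache
    // is smaller, positions already pruned from it cannot be shifted
    // consistently, so shifting is only sound when the sizes match
    bool get_can_shift() const {
        return size_base == size_swa;
    }
};

int main() {
    const kv_iswa_sketch kv = { 8192, 4096 }; // hypothetical cache sizes
    return kv.get_can_shift() ? 1 : 0;        // partial SWA context: disallowed
}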

tools/server/server.cpp
Lines changed: 8 additions & 1 deletion

@@ -3198,7 +3198,14 @@ struct server_context {
                 // if we don't cache the prompt, we have to remove the entire KV cache
                 llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                 slot.n_past = 0;
-                slot.cache_tokens.clear();
+                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+            }
+
+            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                    SLT_WRN(slot, "%s", "forcing full prompt re-processing due to lack of cache data\n");
+                    slot.n_past = 0;
+                }
             }
         }

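Why the server-side guard is needed: prompt caching reuses the first n_past tokens of the previous request, but an SWA cache prunes positions that have slid out of the window, so llama_kv_self_seq_pos_min() can return a value greater than zero. In that case the cached prefix is incomplete and partial reuse would compute attention over missing KV data, hence the forced full re-processing. Below is a self-contained sketch of the failure condition; the struct, window size, and token counts are hypothetical, not the upstream server code.

#include <algorithm>
#include <cstdint>

// Illustrative sketch only, not the upstream implementation.
struct swa_cache_sketch {
    int64_t n_swa;   // hypothetical sliding-window size
    int64_t pos_max; // highest position stored for the sequence

    // analogous to llama_kv_self_seq_pos_min(): positions that slid out of
    // the window have been pruned, so the minimum cached position advances
    int64_t pos_min() const {
        return std::max<int64_t>(0, pos_max + 1 - n_swa);
    }
};

// mirrors the new server guard: reusing a strict prefix of the cached
// tokens is only valid when every position from 0 is still present
bool must_reprocess(const swa_cache_sketch & kv, int64_t n_past, int64_t n_cached) {
    return n_past > 0 && n_past < n_cached && kv.pos_min() > 0;
}

int main() {
    // e.g. window 4096, previous prompt of 6000 tokens: pos_min() == 1904,
    // so a request sharing a 5000-token prefix must re-process from scratch
    const swa_cache_sketch kv = { 4096, 5999 };
    return must_reprocess(kv, 5000, 6000) ? 0 : 1;
}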