
Commit 6f98402

prefill fixed

1 parent 1506b27 commit 6f98402

2 files changed (+35, -24 lines)


prefill.sh

Lines changed: 4 additions & 3 deletions
@@ -1,12 +1,13 @@
 # compile with cmake --build build --config Release -j8 --target llama-cli
 
 # prepare examples
-echo "Show false settings" > query.txt
-cat mypy.ini query.txt >full.txt
+cp pyproject.toml base.txt
+echo "Question: What is the numpy version?" > query.txt
+cat base.txt query.txt >full.txt
 
 # precompute mypy.ini into prefill.save
 rm prefill.save
-./llama-cli -m ~/funstreams/granite-3.1-1b-a400m-instruct-IQ4_NL.gguf -c 2000 -st --prompt-cache prefill.save -f mypy.ini -n 1
+./llama-cli -m ~/funstreams/granite-3.1-1b-a400m-instruct-IQ4_NL.gguf -c 2000 -st --prompt-cache prefill.save -f base.txt -n 1
 
 # use prefill.save to answer query
 ./llama-cli -m ~/funstreams/granite-3.1-1b-a400m-instruct-IQ4_NL.gguf -c 2000 -st --prompt-cache prefill.save -f full.txt --prompt-cache-ro
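
The script exercises llama-cli's prompt cache: the first run evaluates base.txt and writes the resulting context state to prefill.save (-n 1 keeps generation to a single token), and the second run reuses that state; with --prompt-cache-ro the cache file is not rewritten, and because full.txt begins with the cached base.txt only the appended query tokens still need to be evaluated. This mechanism is built on llama.cpp's session-file API (llama_state_save_file / llama_state_load_file); below is a minimal sketch of that save/restore pattern, assuming an already-initialized llama_context that has decoded the prefix. save_prefill and load_prefill are illustrative helper names, not library functions.

// Minimal sketch of the session save/restore pattern behind --prompt-cache.
// Assumes llama.cpp's public C API (llama.h); the helpers are illustrative only.
#include "llama.h"

#include <string>
#include <vector>

// After the prefix tokens have been decoded into `ctx`, persist the context state
// together with the token list so a later run can skip re-evaluating the prefix.
static bool save_prefill(llama_context * ctx,
                         const std::string & path,
                         const std::vector<llama_token> & prefix) {
    return llama_state_save_file(ctx, path.c_str(), prefix.data(), prefix.size());
}

// Restore a previously saved state into `ctx` and return the cached tokens, so the
// caller knows which part of the new prompt is already evaluated and where to resume
// decoding. Returns an empty vector if the cache file cannot be loaded.
static std::vector<llama_token> load_prefill(llama_context * ctx,
                                             const std::string & path,
                                             size_t max_tokens) {
    std::vector<llama_token> cached(max_tokens);
    size_t n_cached = 0;
    if (!llama_state_load_file(ctx, path.c_str(), cached.data(), cached.size(), &n_cached)) {
        return {};
    }
    cached.resize(n_cached);
    return cached;
}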

src/llama-kv-cache-unified.cpp

Lines changed: 31 additions & 21 deletions
@@ -81,11 +81,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         v_cells[s].resize(kv_size);
     }
 
-    // by default, all sequence ids are mapped to the 0th stream
-    seq_to_stream.resize(LLAMA_MAX_SEQ, 0);
+    seq_to_stream.resize(n_seq_max, 0);
 
     if (n_stream > 1) {
-        seq_to_stream.resize(n_stream, 0);
         for (uint32_t s = 0; s < n_stream; ++s) {
             seq_to_stream[s] = s;
         }
@@ -223,12 +221,9 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    if (seq_id != -1) {
+        GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+    }
 
     if (p0 < 0) {
         p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -250,24 +250,34 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
                 }
             }
         }
+
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head = v_heads[s];
 
-            cells.rm(i);
+            uint32_t new_head = cells.size();
 
-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
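
The change is visible in the hunks above: seq_rm used to assert seq_id >= 0 and bind cells/head to seq_to_stream[seq_id] before doing anything else, so the "match any sequence" case (seq_id == -1) could never reach its branch, and the function only ever touched the single stream mapped from seq_id. After the patch the lookup happens inside the seq_id >= 0 branch, while the -1 case walks all n_stream streams and maintains each stream's head separately (the constructor change likewise sizes seq_to_stream by n_seq_max instead of LLAMA_MAX_SEQ). Below is a standalone toy of that per-stream pattern; Stream, pos, and rm_range are simplified stand-ins for illustration, not the llama.cpp types.

// Toy model of per-stream range removal with head tracking. Each Stream is a
// simplified stand-in for one KV-cache stream (v_cells[s] plus v_heads[s]).
#include <cassert>
#include <cstdint>
#include <vector>

struct Stream {
    std::vector<int32_t> pos;   // position stored in each cell, -1 = empty
    uint32_t head = 0;          // index where the next free-slot search starts
};

// Free every occupied cell whose position falls in [p0, p1) and, if anything was
// freed, pull the stream's head back to the earliest freed index.
static void rm_range(Stream & s, int32_t p0, int32_t p1) {
    uint32_t new_head = (uint32_t) s.pos.size();

    for (uint32_t i = 0; i < s.pos.size(); ++i) {
        if (s.pos[i] < p0 || s.pos[i] >= p1) {
            continue;           // outside the range (or already empty)
        }

        s.pos[i] = -1;          // free the cell

        if (new_head == s.pos.size()) {
            new_head = i;       // remember the first freed slot
        }
    }

    if (new_head != s.pos.size() && new_head < s.head) {
        s.head = new_head;
    }
}

int main() {
    // Two streams, as with n_stream > 1. With seq_id == -1 ("match any sequence")
    // the patched seq_rm has to clear the range in every stream, not just one.
    std::vector<Stream> streams = {
        { {0, 1, 2, 3, -1}, 4 },
        { {5, 6, 7, -1, -1}, 3 },
    };

    for (auto & s : streams) {  // corresponds to the else-branch loop over n_stream
        rm_range(s, 1, 3);      // remove positions 1 and 2
    }

    assert(streams[0].pos[1] == -1 && streams[0].pos[2] == -1);
    assert(streams[0].head == 1);   // head pulled back to the first freed slot
    assert(streams[1].head == 3);   // nothing in [1, 3) here, so head is unchanged
    return 0;
}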
