From 4fb572c4681c6d9b66b82ad59712f97b7f09f502 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Tue, 4 Nov 2025 13:38:23 -0700
Subject: [PATCH 1/5] feat(memory): Only fail partial erasure of recurrent tail

The recurrent state is always assumed to be the state as of the last update
from the final token in the sequence. When doing a partial erasure, if the
range does not include the final token, the erasure can be considered a
success: the memory used for the sequence prior to the final token (which
is none, since state isn't preserved for previous tokens) is trivially
removed.

There is one potential case that this doesn't address: pruning the cache to
remove sensitive data from the context. This wouldn't work for attention
cache partial removal (in the middle) either, since the KV state is
linearly-dependent and states in later sequence positions would still be
based on the state from the sensitive data, even if that data is no longer
cached. I therefore don't think this case is relevant. It is still worth
noting that the semantics of this change for a partial erasure in the
middle of the cache are essentially "my context is already compressed" and
not "all trace of the removed tokens has been removed."

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/llama-memory-recurrent.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index 276e1697d466c..7cfeea640e571 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -151,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         p1 = std::numeric_limits<llama_pos>::max();
     }
 
-    // models like Mamba or RWKV can't have a state partially erased
+    // models like Mamba or RWKV can't have a state partially erased at the end
+    // of the sequence because their state isn't preserved for previous tokens
     if (seq_id >= (int64_t) size) {
         // could be fatal
         return false;
@@ -160,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         int32_t & tail_id = cells[seq_id].tail;
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
-            // partial intersection is invalid
-            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+            // partial intersection is invalid if it includes the final pos
+            if ((0 < p0 && p1 > cell.pos)) {
                 //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }

From 24aff7fa7989c371474aebb5d575c0461cdc1cae Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Tue, 4 Nov 2025 13:40:42 -0700
Subject: [PATCH 2/5] fix(main): Check the output of seq_rm for prefix matching

This prefix matching is explicitly attempting to remove the tokens at the
end of the sequence that don't match. This is the operation that can't be
performed on a recurrent cache due to the state being updated in place, so
if this removal fails, we need to clear the whole cache.

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/main/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 498e00e3a5e58..252d120a3b804 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -354,7 +354,10 @@ int main(int argc, char ** argv) {
         }
 
         // remove any "future" tokens that we might have inherited from the previous session
-        llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
+        if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) {
+            LOG_INF("%s: unable to resuse common prefix\n", __func__);
+            llama_memory_seq_rm(mem, -1, -1, -1);
+        }
     }
 
     LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",

From 3b5902156b2978f170388345b131b85337a1408a Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <gabe.l.hart@gmail.com>
Date: Tue, 4 Nov 2025 14:35:24 -0700
Subject: [PATCH 3/5] fix(memory): Fix condition for partial erasure failure if
 p0 > pos

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: compilade <git@compilade.net>
---
 src/llama-memory-recurrent.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index 7cfeea640e571..efc490dcc7f13 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -162,7 +162,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid if it includes the final pos
-            if ((0 < p0 && p1 > cell.pos)) {
+            if ((0 < p0 && p0 <= cell.pos && p1 > cell.pos)) {
                 //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }

From b8f9c56923ab34bf24d4fe5476efe9b02736918d Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <gabe.l.hart@gmail.com>
Date: Wed, 5 Nov 2025 06:24:41 -0700
Subject: [PATCH 4/5] style: Fix extra parens

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 src/llama-memory-recurrent.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index efc490dcc7f13..812bf2530491a 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -162,7 +162,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid if it includes the final pos
-            if ((0 < p0 && p0 <= cell.pos && p1 > cell.pos)) {
+            if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
                 //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }

From 104d8ace2b68815eff61015f1472389130038597 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Thu, 6 Nov 2025 12:08:20 -0700
Subject: [PATCH 5/5] fix(main.cpp): Set n_matching_session_tokens to 0 on
 cache clear

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/main/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 252d120a3b804..33e8862335793 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -356,6 +356,7 @@ int main(int argc, char ** argv) {
         // remove any "future" tokens that we might have inherited from the previous session
         if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) {
             LOG_INF("%s: unable to resuse common prefix\n", __func__);
+            n_matching_session_tokens = 0;
             llama_memory_seq_rm(mem, -1, -1, -1);
         }
     }