Fix critical KV cache crash bug causing std::length_error

Anivar A Aravind · Anivar A Aravind · commit 5b73d10f110a · 2025-07-28T06:44:23.000+05:30
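Fixes an unsigned integer underflow in `slot.cache_tokens.size() - n_discard`: when n_discard exceeds cache_tokens.size(), the subtraction wraps around to a huge size_t value and `resize()` throws std::length_error, crashing the server. This commonly occurs during KV cache context shifting, particularly with Chinese text translation workloads. The fix adds bounds checking before resizing the cache_tokens vector: the vector is shrunk only when 0 <= n_discard < cache_tokens.size(), and is cleared otherwise. Fixes #771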
diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp
@@ -1711,7 +1711,12 @@ struct llama_server_context
                         slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                     }
 
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    // Guard against size_t wraparound in size() - n_discard, which would make resize() throw std::length_error
+                    if (n_discard >= 0 && (size_t)n_discard < slot.cache_tokens.size()) {
+                        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    } else {
+                        slot.cache_tokens.clear();
+                    }
 
                     slot.n_past -= n_discard;