Guard the cache_tokens shrink in llama_server_context: when n_discard is negative
or not smaller than cache_tokens.size(), clear the vector instead of calling
resize() with a wrapped-around size. Adds a standalone test mirroring the guard.

diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp
index 555b0e1098..2373e7d5b6 100644
--- a/llama.cpp/server/server.cpp
+++ b/llama.cpp/server/server.cpp
@@ -1711,7 +1711,12 @@ struct llama_server_context
                         slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                     }
 
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    // Prevent integer underflow that causes std::length_error
+                    if (n_discard >= 0 && (size_t)n_discard < slot.cache_tokens.size()) {
+                        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    } else {
+                        slot.cache_tokens.clear();
+                    }
 
                     slot.n_past -= n_discard;
 
diff --git a/test_kv_cache_fix.cpp b/test_kv_cache_fix.cpp
new file mode 100644
index 0000000000..49650348c7
--- /dev/null
+++ b/test_kv_cache_fix.cpp
@@ -0,0 +1,41 @@
+#include <cassert>
+#include <iostream>
+#include <vector>
+
+// Test for the KV cache crash fix
+void test_cache_tokens_resize_fix() {
+    std::cout << "Testing KV cache resize fix..." << std::endl;
+
+    // Simulate the problematic condition
+    std::vector<int> cache_tokens = {1, 2, 3, 4, 5};
+    size_t original_size = cache_tokens.size();
+
+    // Test cases that could cause integer underflow
+    int n_discard_cases[] = {-1, 0, 3, 5, 10};
+
+    for (int n_discard : n_discard_cases) {
+        std::vector<int> test_tokens = cache_tokens;
+
+        std::cout << "Testing n_discard = " << n_discard
+                  << " with cache size = " << test_tokens.size() << std::endl;
+
+        // Apply the fixed logic
+        if (n_discard >= 0 && (size_t)n_discard < test_tokens.size()) {
+            test_tokens.resize(test_tokens.size() - n_discard);
+            std::cout << "  Resized to: " << test_tokens.size() << std::endl;
+        } else {
+            test_tokens.clear();
+            std::cout << "  Cleared to: " << test_tokens.size() << std::endl;
+        }
+
+        // Verify no crash occurred
+        assert(test_tokens.size() <= original_size);
+    }
+
+    std::cout << "All test cases passed! KV cache fix works correctly." << std::endl;
+}
+
+int main() {
+    test_cache_tokens_resize_fix();
+    return 0;
+}
\ No newline at end of file