Skip to content

Commit 1f9e927

Browse files
committed
Fix critical KV cache crash bug causing std::length_error
Resolves issue #771, where the server crashes with std::length_error when KV cache context shifting resizes the cache_tokens vector.

The bug occurs in update_slots() when n_discard is greater than cache_tokens.size(): the unsigned subtraction in cache_tokens.resize(size - n_discard) underflows (wraps around to a huge value), so resize() requests a massive allocation and throws std::length_error.

Changes:
- Add bounds checking before cache_tokens.resize() in server.cpp:1714
- Clear cache_tokens when n_discard would cause underflow
- Prevent negative n_discard values from causing issues

This fix prevents the production server crashes reported with Chinese text translation workloads and in high-memory-pressure scenarios.
1 parent d4135d2 commit 1f9e927

File tree

3 files changed

+47
-1
lines changed

3 files changed

+47
-1
lines changed

llama.cpp/server/server.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1711,7 +1711,12 @@ struct llama_server_context
17111711
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
17121712
}
17131713

1714-
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
1714+
// Prevent integer underflow that causes std::length_error
1715+
if (n_discard >= 0 && (size_t)n_discard < slot.cache_tokens.size()) {
1716+
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
1717+
} else {
1718+
slot.cache_tokens.clear();
1719+
}
17151720

17161721
slot.n_past -= n_discard;
17171722

test_kv_cache_fix

72.6 KB
Binary file not shown.

test_kv_cache_fix.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#include <iostream>
2+
#include <vector>
3+
#include <cassert>
4+
5+
// Simulated test for the KV cache fix
6+
// Simulated regression test for the guarded cache_tokens resize.
//
// Replays the guard (resize when 0 <= n_discard < size, otherwise clear) on a
// five-element stand-in vector for n_discard values that are negative, zero,
// in range, equal to the size, and larger than the size.
//
// Each case is pinned to the exact size the guard must produce, and the
// surviving elements are checked to be the untouched prefix of the input. A
// bare "did not grow" check would pass even if the wrong branch were taken.
//
// Note: the checks use assert(), so they are compiled out when NDEBUG is
// defined; build without NDEBUG for this test to be meaningful.
void test_cache_tokens_resize_fix() {
    // std::endl is kept on purpose: it flushes, so the progress lines are
    // visible even if a later assert aborts the process.
    std::cout << "Testing KV cache resize fix..." << std::endl;

    // Simulate the problematic condition.
    const std::vector<int> cache_tokens = {1, 2, 3, 4, 5};

    // One entry per scenario: the discard count and the size the guarded
    // logic must leave behind for a five-element input.
    struct DiscardCase {
        int n_discard;
        std::size_t expected_size;
    };
    const DiscardCase discard_cases[] = {
        {-1, 0},  // negative count: guard rejects it and clears
        {0, 5},   // nothing to discard: size unchanged
        {3, 2},   // in range: plain shrink
        {5, 0},   // equal to size: falls through to clear
        {10, 0},  // larger than size: would wrap without the guard
    };

    for (const DiscardCase& tc : discard_cases) {
        const int n_discard = tc.n_discard;
        std::vector<int> test_tokens = cache_tokens;

        std::cout << "Testing n_discard = " << n_discard
                  << " with cache size = " << test_tokens.size() << std::endl;

        // Apply the fixed logic.
        if (n_discard >= 0 && static_cast<std::size_t>(n_discard) < test_tokens.size()) {
            test_tokens.resize(test_tokens.size() - n_discard);
            std::cout << " Resized to: " << test_tokens.size() << std::endl;
        } else {
            test_tokens.clear();
            std::cout << " Cleared to: " << test_tokens.size() << std::endl;
        }

        // Verify the exact outcome, not merely that the vector did not grow.
        assert(test_tokens.size() == tc.expected_size);

        // Whatever survives must be the original leading elements.
        for (std::size_t i = 0; i < test_tokens.size(); ++i) {
            assert(test_tokens[i] == cache_tokens[i]);
        }
    }

    std::cout << "All test cases passed! KV cache fix works correctly." << std::endl;
}
37+
38+
// Program entry point: runs the simulated KV cache resize check.
// Falling off the end of main() yields exit status 0, as before.
int main() {
    test_cache_tokens_resize_fix();
}

0 commit comments

Comments
 (0)