Skip to content

Commit b75afe3

Browse files
committed
server : fix context limit check to use slot.n_past
ggml-ci
1 parent 8a1f439 commit b75afe3

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

examples/server/server.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,12 +1102,13 @@ struct server_context {
11021102
}
11031103

11041104
// if context shift is disabled, we stop when it reaches the context limit
1105-
if (slot.n_decoded >= slot.n_ctx) {
1105+
if (slot.n_past >= slot.n_ctx) {
11061106
slot.truncated = true;
11071107
slot.stopped_limit = true;
11081108
slot.has_next_token = false;
11091109

1110-
SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
1110+
SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
1111+
slot.n_past, slot.n_prompt_tokens, slot.n_decoded, slot.n_ctx);
11111112
}
11121113

11131114
if (llama_token_is_eog(model, result.tok)) {
@@ -1797,7 +1798,7 @@ struct server_context {
17971798
// apply context-shift if needed
17981799
// TODO: simplify and improve
17991800
for (server_slot & slot : slots) {
1800-
if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
1801+
if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
18011802
if (!params.ctx_shift) {
18021803
// this check is redundant (for good)
18031804
// we should never get here, because generation should already stopped in process_token()
@@ -1960,6 +1961,8 @@ struct server_context {
19601961
} else {
19611962
if (!params.ctx_shift) {
19621963
// if context shift is disabled, we make sure prompt size is smaller than KV size
1964+
// TODO: there should be a separate parameter that controls prompt truncation
1965+
// context shift should be applied only during the generation phase
19631966
if (slot.n_prompt_tokens >= slot.n_ctx) {
19641967
slot.release();
19651968
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);

examples/server/tests/features/ctx_shift.feature

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Feature: llama.cpp server
1313
And 32 as batch size
1414
And 2 slots
1515

16+
# the prompt is 301 tokens
17+
# the slot context is 256/2 = 128 tokens
18+
# the prompt is truncated to keep the last 109 tokens
19+
# 64 tokens are generated thanks to shifting the context when it gets full
1620
Scenario: Inference with context shift
1721
And 64 server max tokens to predict
1822
Then the server is starting

0 commit comments

Comments
 (0)