Skip to content

Commit 5688864

Browse files
committed
allow disabling context shift for server
1 parent 64c6af3 commit 5688864

File tree

3 files changed

+15
-1
lines changed

3 files changed

+15
-1
lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
691691
[](gpt_params & params) {
692692
params.ctx_shift = false;
693693
}
694-
).set_examples({LLAMA_EXAMPLE_MAIN}));
694+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
695695
add_opt(llama_arg(
696696
{"--chunks"}, "N",
697697
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),

examples/server/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ The project is under active development, and we are [looking for feedback and co
3737
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
3838
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
3939
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
40+
| `--no-context-shift` | stop generation when context window is full (default: context shift is enabled)<br/> |
4041
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
4142
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
4243
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |

examples/server/server.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,6 +1815,13 @@ struct server_context {
18151815
for (server_slot & slot : slots) {
18161816
if (slot.ga_n == 1) {
18171817
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
1818+
if (!params.ctx_shift){
1819+
slot.release();
1820+
slot.print_timings();
1821+
send_final_response(slot);
1822+
metrics.on_prediction(slot);
1823+
continue;
1824+
}
18181825
// Shift context
18191826
const int n_keep = slot.params.n_keep + add_bos_token;
18201827
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
@@ -1940,6 +1947,12 @@ struct server_context {
19401947
send_final_response(slot);
19411948
continue;
19421949
}
1950+
// context shift is disabled and prompt is too large - discard it
1951+
if (!params.ctx_shift && slot.n_prompt_tokens > slot.n_ctx ){
1952+
slot.release();
1953+
send_error(slot, "Input is too large to process. Either enable context shift or increase context length.", ERROR_TYPE_SERVER);
1954+
continue;
1955+
}
19431956

19441957
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
19451958
// this prompt is too large to process - discard it

0 commit comments

Comments
 (0)