2 changes: 1 addition & 1 deletion common/arg.cpp
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
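For context, `--no-context-shift` was already wired up to set `params.ctx_shift = false`; the change above only adds `LLAMA_EXAMPLE_SERVER` to the option's examples set so the server binary accepts the flag too. A minimal standalone sketch of that gating pattern, with simplified stand-in names (`cli_option`, `example_id`) rather than the real `llama_arg` machinery:

```cpp
// Minimal standalone sketch (not the real common/arg.cpp): an option is only
// offered to a binary whose "example" id appears in the option's examples set.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

enum example_id { EXAMPLE_MAIN, EXAMPLE_SERVER };

struct cli_option {
    std::string flag;
    std::set<example_id> examples; // which binaries expose this flag

    cli_option & set_examples(std::set<example_id> ex) {
        examples = std::move(ex);
        return *this;
    }
};

int main() {
    std::vector<cli_option> opts;
    // before the patch the flag was gated to the main example only:
    //   cli_option{"--no-context-shift"}.set_examples({EXAMPLE_MAIN})
    // after the patch the server example is included as well:
    opts.push_back(cli_option{"--no-context-shift"}.set_examples({EXAMPLE_MAIN, EXAMPLE_SERVER}));

    const example_id current = EXAMPLE_SERVER; // pretend we are the server binary
    for (const cli_option & opt : opts) {
        if (opt.examples.count(current) > 0) {
            std::printf("%s is accepted by this binary\n", opt.flag.c_str());
        }
    }
    return 0;
}
```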
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -37,6 +37,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
 | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
+| `--no-context-shift` | stop generation when context window is full (default: context shift is enabled)<br/> |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
16 changes: 14 additions & 2 deletions examples/server/server.cpp
@@ -1464,11 +1464,10 @@ struct server_context {
         std::vector<server_task_result> results(id_tasks.size());
         for (size_t i = 0; i < id_tasks.size(); i++) {
             server_task_result result = queue_results.recv(id_tasks);
 
             if (result.error) {
                 error_handler(result.data);
                 cancel_tasks(id_tasks);
-                break;
+                return;
             }
 
             size_t idx = result.data["index"];
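The switch from `break` to `return` matters because `results` is pre-sized and the code after this loop consumes it; breaking out on error would let that later code run over entries that were never filled in, whereas returning abandons the batch once the error has been reported and the remaining tasks cancelled. A hedged, standalone sketch of that difference (the names `recv_next` and `receive_all` are illustrative, not the server's API):

```cpp
// Illustrative sketch (hypothetical names, not the server's types): why `return`
// differs from `break` when collecting a pre-sized batch of results.
#include <cstddef>
#include <cstdio>
#include <vector>

struct result {
    bool error = false;
    int  value = 0;
};

static result recv_next(size_t i) {
    // pretend the second task comes back as an error
    return result{ i == 1, (int) i * 10 };
}

static void receive_all(size_t n) {
    std::vector<result> results(n); // entries start default-constructed
    for (size_t i = 0; i < n; i++) {
        result r = recv_next(i);
        if (r.error) {
            std::printf("error on task %zu, abandoning the batch\n", i);
            return; // with `break`, the loop below would still run over
                    // entries of `results` that were never filled in
        }
        results[i] = r;
    }
    for (const result & r : results) {
        std::printf("value: %d\n", r.value); // reached only if every task succeeded
    }
}

int main() {
    receive_all(3);
    return 0;
}
```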
@@ -1815,6 +1814,13 @@ struct server_context {
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
                 if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                    if (!params.ctx_shift){
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        continue;
+                    }
                     // Shift context
                     const int n_keep = slot.params.n_keep + add_bos_token;
                     const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
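When `--no-context-shift` is not given, execution falls through to the shift that starts at the `// Shift context` lines above (the rest is elided from this hunk): keep the first `n_keep` tokens and free room by discarding part of what follows. A minimal sketch of that arithmetic under the same `n_keep`/`n_left` definitions, operating on a plain token vector instead of the server's KV cache (`shift_context` and the halving heuristic are illustrative of the general pattern, not a copy of the elided code):

```cpp
// Standalone sketch of a context shift on a plain token vector (not the server's
// KV-cache calls): keep the first n_keep tokens, discard part of what follows.
#include <cstdio>
#include <vector>

static void shift_context(std::vector<int> & tokens, int n_keep) {
    const int n_left    = (int) tokens.size() - n_keep;
    const int n_discard = n_left / 2; // halving is one common heuristic
    if (n_discard <= 0) {
        return;
    }
    // drop the n_discard tokens right after the kept prefix; the rest slides down
    tokens.erase(tokens.begin() + n_keep, tokens.begin() + n_keep + n_discard);
    std::printf("kept %d, discarded %d, %zu tokens remain\n", n_keep, n_discard, tokens.size());
}

int main() {
    std::vector<int> tokens(4096, 0); // pretend the context is full
    shift_context(tokens, /*n_keep =*/ 64);
    return 0;
}
```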
@@ -1940,6 +1946,12 @@ struct server_context {
                         send_final_response(slot);
                         continue;
                     }
+                    // context shift is disabled and prompt is too large - discard it
+                    if (!params.ctx_shift && (slot.n_prompt_tokens > slot.n_ctx) ){
+                        slot.release();
+                        send_error(slot, "Input is too large to process. Either enable context shift or increase the context length.", ERROR_TYPE_INVALID_REQUEST);
+                        continue;
+                    }
 
                     if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
                         // this prompt is too large to process - discard it

Review comment on the added check `slot.n_prompt_tokens > slot.n_ctx`:

> I don't know if this leads to correct behavior. I found that with `slot.n_prompt_tokens > slot.n_ctx` it's possible to fall through the check down to prompt truncation, which might be confusing for the user. Maybe we should change it to `slot.n_prompt_tokens >= slot.n_ctx`. Maybe someone more knowledgeable could chime in.
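To make the reviewer's boundary case concrete: with the strict comparison, a prompt of exactly `n_ctx` tokens is not rejected here and, per the comment above, continues on toward the truncation path. A small illustrative sketch with made-up numbers:

```cpp
// Worked boundary case for the review comment above (made-up numbers).
#include <cstdio>

int main() {
    const int n_ctx           = 4096;
    const int n_prompt_tokens = 4096; // prompt exactly fills the context

    // the check as written in the patch: strict '>' lets the boundary case pass,
    // so the request continues toward the truncation path further down
    if (n_prompt_tokens > n_ctx) {
        std::printf("'>'  : rejected as too large\n");
    } else {
        std::printf("'>'  : falls through (may be truncated later)\n");
    }

    // the reviewer's suggestion: '>=' would reject the boundary case here instead
    if (n_prompt_tokens >= n_ctx) {
        std::printf("'>=' : rejected as too large\n");
    }
    return 0;
}
```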