2 changes: 1 addition & 1 deletion common/arg.cpp
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
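For context, `--no-context-shift` was already wired up to set `params.ctx_shift = false`; the change above only adds `LLAMA_EXAMPLE_SERVER` to the option's examples set so the server binary accepts the flag too. A minimal standalone sketch of that gating pattern, with simplified stand-in names (`cli_option`, `example_id`) rather than the real `llama_arg` machinery:

```cpp
// Minimal standalone sketch (not the real common/arg.cpp): an option is only
// offered to a binary whose "example" id appears in the option's examples set.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

enum example_id { EXAMPLE_MAIN, EXAMPLE_SERVER };

struct cli_option {
    std::string flag;
    std::set<example_id> examples; // which binaries expose this flag

    cli_option & set_examples(std::set<example_id> ex) {
        examples = std::move(ex);
        return *this;
    }
};

int main() {
    std::vector<cli_option> opts;
    // before the patch the flag was gated to the main example only:
    //   cli_option{"--no-context-shift"}.set_examples({EXAMPLE_MAIN})
    // after the patch the server example is included as well:
    opts.push_back(cli_option{"--no-context-shift"}.set_examples({EXAMPLE_MAIN, EXAMPLE_SERVER}));

    const example_id current = EXAMPLE_SERVER; // pretend we are the server binary
    for (const cli_option & opt : opts) {
        if (opt.examples.count(current) > 0) {
            std::printf("%s is accepted by this binary\n", opt.flag.c_str());
        }
    }
    return 0;
}
```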
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -37,6 +37,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
 | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
+| `--no-context-shift` | stop generation when context window is full (default: context shift is enabled)<br/> |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
16 changes: 14 additions & 2 deletions examples/server/server.cpp
@@ -1464,11 +1464,10 @@ struct server_context {
         std::vector<server_task_result> results(id_tasks.size());
         for (size_t i = 0; i < id_tasks.size(); i++) {
             server_task_result result = queue_results.recv(id_tasks);
 
             if (result.error) {
                 error_handler(result.data);
                 cancel_tasks(id_tasks);
-                break;
+                return;
             }
 
             size_t idx = result.data["index"];
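The switch from `break` to `return` matters because `results` is pre-sized and the code after this loop consumes it; breaking out on error would let that later code run over entries that were never filled in, whereas returning abandons the batch once the error has been reported and the remaining tasks cancelled. A hedged, standalone sketch of that difference (the names `recv_next` and `receive_all` are illustrative, not the server's API):

```cpp
// Illustrative sketch (hypothetical names, not the server's types): why `return`
// differs from `break` when collecting a pre-sized batch of results.
#include <cstddef>
#include <cstdio>
#include <vector>

struct result {
    bool error = false;
    int  value = 0;
};

static result recv_next(size_t i) {
    // pretend the second task comes back as an error
    return result{ i == 1, (int) i * 10 };
}

static void receive_all(size_t n) {
    std::vector<result> results(n); // entries start default-constructed
    for (size_t i = 0; i < n; i++) {
        result r = recv_next(i);
        if (r.error) {
            std::printf("error on task %zu, abandoning the batch\n", i);
            return; // with `break`, the loop below would still run over
                    // entries of `results` that were never filled in
        }
        results[i] = r;
    }
    for (const result & r : results) {
        std::printf("value: %d\n", r.value); // reached only if every task succeeded
    }
}

int main() {
    receive_all(3);
    return 0;
}
```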
@@ -1815,6 +1814,13 @@ struct server_context {
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
                 if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                    if (!params.ctx_shift){
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        continue;
+                    }
                     // Shift context
                     const int n_keep = slot.params.n_keep + add_bos_token;
                     const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
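When `--no-context-shift` is not given, execution falls through to the shift that starts at the `// Shift context` lines above (the rest is elided from this hunk): keep the first `n_keep` tokens and free room by discarding part of what follows. A minimal sketch of that arithmetic under the same `n_keep`/`n_left` definitions, operating on a plain token vector instead of the server's KV cache (`shift_context` and the halving heuristic are illustrative of the general pattern, not a copy of the elided code):

```cpp
// Standalone sketch of a context shift on a plain token vector (not the server's
// KV-cache calls): keep the first n_keep tokens, discard part of what follows.
#include <cstdio>
#include <vector>

static void shift_context(std::vector<int> & tokens, int n_keep) {
    const int n_left    = (int) tokens.size() - n_keep;
    const int n_discard = n_left / 2; // halving is one common heuristic
    if (n_discard <= 0) {
        return;
    }
    // drop the n_discard tokens right after the kept prefix; the rest slides down
    tokens.erase(tokens.begin() + n_keep, tokens.begin() + n_keep + n_discard);
    std::printf("kept %d, discarded %d, %zu tokens remain\n", n_keep, n_discard, tokens.size());
}

int main() {
    std::vector<int> tokens(4096, 0); // pretend the context is full
    shift_context(tokens, /*n_keep =*/ 64);
    return 0;
}
```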
@@ -1940,6 +1946,12 @@ struct server_context {
                         send_final_response(slot);
                         continue;
                     }
+                    // context shift is disabled and prompt is too large - discard it
+                    if (!params.ctx_shift && (slot.n_prompt_tokens > slot.n_ctx) ){
+                        slot.release();
+                        send_error(slot, "Input is too large to process. Either enable context shift or increase the context length.", ERROR_TYPE_INVALID_REQUEST);
+                        continue;
+                    }
 
                     if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
                         // this prompt is too large to process - discard it

Review comment on the added check `slot.n_prompt_tokens > slot.n_ctx`:

> I don't know if this leads to correct behavior. I found that with `slot.n_prompt_tokens > slot.n_ctx` it's possible to fall through the check down to prompt truncation, which might be confusing for the user. Maybe we should change it to `slot.n_prompt_tokens >= slot.n_ctx`. Maybe someone more knowledgeable could chime in.
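To make the reviewer's boundary case concrete: with the strict comparison, a prompt of exactly `n_ctx` tokens is not rejected here and, per the comment above, continues on toward the truncation path. A small illustrative sketch with made-up numbers:

```cpp
// Worked boundary case for the review comment above (made-up numbers).
#include <cstdio>

int main() {
    const int n_ctx           = 4096;
    const int n_prompt_tokens = 4096; // prompt exactly fills the context

    // the check as written in the patch: strict '>' lets the boundary case pass,
    // so the request continues toward the truncation path further down
    if (n_prompt_tokens > n_ctx) {
        std::printf("'>'  : rejected as too large\n");
    } else {
        std::printf("'>'  : falls through (may be truncated later)\n");
    }

    // the reviewer's suggestion: '>=' would reject the boundary case here instead
    if (n_prompt_tokens >= n_ctx) {
        std::printf("'>=' : rejected as too large\n");
    }
    return 0;
}
```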