
Commit a015911

fill context size
1 parent 16e75f1 commit a015911

File tree: 4 files changed (+8, -12 lines)

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion

@@ -13,4 +13,3 @@ repos:
     rev: 7.0.0
     hooks:
     - id: flake8
-      additional_dependencies: [flake8-no-print]

examples/server/server.cpp

Lines changed: 2 additions & 9 deletions

@@ -219,14 +219,12 @@ struct server_slot {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
-        else if (global_params.n_predict == -2) {
-            return true; // generate until context is filled
-        }
-
         n_remaining = -1;

         if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
+        } else if (global_params.n_predict == -2) {
+            n_remaining = n_ctx - n_past - 1;
         } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }

@@ -1815,11 +1813,6 @@ struct server_context {
                     continue;
                 }

-                if (params.n_predict == -2) {
-                    slot.release();
-                    send_final_response(slot);
-                    continue;
-                }

                // Shift context
                const int n_keep = slot.params.n_keep + add_bos_token;
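Taken together, the two hunks move the "fill the context" behaviour out of the update loop and into the budget computation: instead of reporting an unlimited budget and later releasing the slot once the cache is full, a server-wide n_predict of -2 now yields a concrete remaining-token count. Below is a minimal, self-contained C++ sketch of the post-change branch order; the remaining_tokens wrapper, its parameter names, and the numbers in main are illustrative and not llama.cpp code, only the branch logic mirrors the first hunk above.

#include <cstdio>

// Illustrative wrapper around the post-change budget logic shown in the diff.
// slot_n_predict  : per-request n_predict (-1 means "not set")
// global_n_predict: server-wide n_predict (-1 unlimited, -2 fill the context)
static int remaining_tokens(int slot_n_predict, int global_n_predict,
                            int n_decoded, int n_ctx, int n_past) {
    if (slot_n_predict == -1 && global_n_predict == -1) {
        return -1; // limitless: no budget is tracked
    }

    int n_remaining = -1;

    if (slot_n_predict != -1) {
        // a per-request limit takes precedence
        n_remaining = slot_n_predict - n_decoded;
    } else if (global_n_predict == -2) {
        // new behaviour: generate until the KV cache is filled
        n_remaining = n_ctx - n_past - 1;
    } else if (global_n_predict != -1) {
        // otherwise fall back to the server-wide limit
        n_remaining = global_n_predict - n_decoded;
    }

    return n_remaining;
}

int main() {
    // Example values only: 64-cell context, 52 cells already used, server-wide -2.
    printf("%d tokens left\n", remaining_tokens(-1, -2, /*n_decoded=*/0,
                                                /*n_ctx=*/64, /*n_past=*/52)); // prints 11
    return 0;
}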

examples/server/tests/features/n_predict.feature

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@ Feature: llama.cpp server
     And a model alias tinyllama-2
     And 42 as server seed
     And 64 KV cache size
+
   Scenario: Generate N tokens
     And 12 max tokens to predict
     Then the server is starting

@@ -18,6 +19,7 @@ Feature: llama.cpp server
     """
     And a completion request with no api error
     Then 12 tokens are predicted
+
   Scenario: Generate tokens until context is full
     And -2 server max tokens to predict
     Then the server is starting

@@ -28,4 +30,3 @@ Feature: llama.cpp server
     """
     And a completion request with no api error
     Then 11 tokens are predicted
-
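A note on the expected count in the "until context is full" scenario (illustrative arithmetic; the prompt length is not part of this diff): with the 64-token KV cache configured above and the server started with n_predict = -2, the new formula gives n_remaining = n_ctx - n_past - 1, so the asserted 11 predicted tokens would correspond to a prompt occupying roughly 52 cache cells when generation starts, since 64 - 52 - 1 = 11.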

examples/server/tests/features/steps/steps.py

Lines changed: 4 additions & 1 deletion

@@ -159,7 +159,10 @@ def step_n_slots(context, n_slots: int):

 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 or n_predict in (-1, -2) else None
+    if n_predict > 0 or n_predict in (-1, -2):
+        context.n_server_predict = n_predict
+    else:
+        context.n_server_predict = None


 @step('{slot_save_path} as slot save path')
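Either form of the step has the same effect: a positive cap, -1 (unlimited), or -2 (fill the context) is stored on the behave context and presumably forwarded to the server under test when it is started, while any other value (e.g. 0) leaves context.n_server_predict as None.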
