From b85e14943b4c6f658a824c58a98650baff7e3721 Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Mon, 10 Mar 2025 18:30:55 -0400
Subject: [PATCH 1/6] Respect n_predict=-2 in server

---
 examples/server/server.cpp | 17 ++++++++++++-----
 test.sh                    | 12 ++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)
 create mode 100755 test.sh

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8cb8d0033f7d9..3b5ccc587ad15 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1321,17 +1321,24 @@ struct server_slot {
             && are_lora_equal(lora, other_slot.lora);
     }
 
+    // There are two caps on the budget of a single request:
+    //  * [params.n_predict]
+    //  * [global_params.n_predict]
+    // This function returns true while the request still has tokens left to generate under them.
     bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
+        n_remaining = INT32_MAX;
 
-        n_remaining = -1;
+        // The request or server have finite limits on the number of tokens to generate.
+        if ((params.n_predict != -1 && params.n_predict != -2) || (global_params.n_predict  != -1 && global_params.n_predict != -2)) {
+            n_remaining = std::min(n_remaining, params.n_predict - n_decoded);
+        }
 
-        if (params.n_predict != -1) {
-            n_remaining = params.n_predict - n_decoded;
-        } else if (global_params.n_predict != -1) {
-            n_remaining = global_params.n_predict - n_decoded;
+        // The request or server have limits based on the context window.
+        if (params.n_predict == -2 || global_params.n_predict == -2) {
+            n_remaining = std::min(n_remaining, n_ctx - n_decoded);
         }
 
         return n_remaining > 0; // no budget
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000000000..0f276fa739be2
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,12 @@
+curl --location 'http://localhost:8080/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer no-key' \
+--data '{
+"messages": [
+{
+"role": "user",
+"content": "Count from 1 to 4097 one at a time, separating each number with a newline. You should not abbreviate the numbers, but list out every single one."
+}
+],
+"n_predict": -2
+}'

From f3fdca7ee67fa106a243db9a12f62270c6661796 Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Mon, 10 Mar 2025 18:37:54 -0400
Subject: [PATCH 2/6] Remove test.sh

---
 test.sh | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100755 test.sh

diff --git a/test.sh b/test.sh
deleted file mode 100755
index 0f276fa739be2..0000000000000
--- a/test.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-curl --location 'http://localhost:8080/v1/chat/completions' \
---header 'Content-Type: application/json' \
---header 'Authorization: Bearer no-key' \
---data '{
-"messages": [
-{
-"role": "user",
-"content": "Count from 1 to 4097 one at a time, separating each number with a newline. You should not abbreviate the numbers, but list out every single one."
-}
-],
-"n_predict": -2
-}'

From 7199eb9dede658773ba4d941068dfc1b71b5fd54 Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Tue, 11 Mar 2025 10:31:27 -0400
Subject: [PATCH 3/6] Add test that when n_predict=-2 predicted_n==n_ctx

---
 examples/server/tests/unit/test_completion.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
index 0ed5b99bef4e4..8568ea4e26974 100644
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -426,3 +426,18 @@ def test_cancel_request():
     time.sleep(1) # wait for HTTP_POLLING_SECONDS
     res = server.make_request("GET", "/slots")
     assert res.body[0]["is_processing"] == False
+
+
+def test_context_window_sized_completion():
+    global server
+    server.n_ctx = 16
+    server.n_predict = -1
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": -2,
+        "prompt": "The 50 states in the US are ",
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["predicted_n"] == server.n_ctx
+    assert res.body["stop_type"] == "limit"
+    assert type(res.body["has_new_line"]) == bool

From 8511ec549b9b60c19cdf2069427c888cc71ca6a1 Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Tue, 11 Mar 2025 16:25:21 -0400
Subject: [PATCH 4/6] Improve readability

---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3b5ccc587ad15..1b4961d425122 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1331,8 +1331,8 @@ struct server_slot {
         }
         n_remaining = INT32_MAX;
 
-        // The request or server have finite limits on the number of tokens to generate.
-        if ((params.n_predict != -1 && params.n_predict != -2) || (global_params.n_predict  != -1 && global_params.n_predict != -2)) {
+        // The request or server have specified limits on the number of tokens to generate.
+        if ((params.n_predict >= 0) || (global_params.n_predict  >= 0)) {
             n_remaining = std::min(n_remaining, params.n_predict - n_decoded);
         }
 

From ff419295a0c3fbbd823086db7f6823cf23a4fa3f Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Wed, 12 Mar 2025 09:13:45 -0400
Subject: [PATCH 5/6] Unlimit the n_predict in slots

---
 examples/server/tests/unit/test_completion.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
index 8568ea4e26974..b66c4a993ede3 100644
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -143,6 +143,7 @@ def test_consistent_result_same_seed(n_slots: int):
 def test_different_result_different_seed(n_slots: int):
     global server
     server.n_slots = n_slots
+    server.n_predict = -1
     server.start()
     last_res = None
     for seed in range(4):
@@ -150,6 +151,7 @@ def test_different_result_different_seed(n_slots: int):
             "prompt": "I believe the meaning of life is",
             "seed": seed,
             "temperature": 1.0,
+            "n_predict": -1,
             "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
         })
         if last_res is not None:

From f94e1059b2af56bb609888ecaad19e67159663fb Mon Sep 17 00:00:00 2001
From: ishaangandhi <ishaangandhi@gmail.com>
Date: Wed, 12 Mar 2025 13:47:35 -0400
Subject: [PATCH 6/6] Create new server settings object for test

---
 examples/server/tests/unit/test_completion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
index b66c4a993ede3..532c08597c7ab 100644
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -431,7 +431,7 @@ def test_cancel_request():
 
 
 def test_context_window_sized_completion():
-    global server
+    server = ServerPreset.tinyllama2()
     server.n_ctx = 16
     server.n_predict = -1
     server.start()