From b85e14943b4c6f658a824c58a98650baff7e3721 Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Mon, 10 Mar 2025 18:30:55 -0400 Subject: [PATCH 1/6] Respect n_predict=-2 in server --- examples/server/server.cpp | 17 ++++++++++++----- test.sh | 12 ++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) create mode 100755 test.sh diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8cb8d0033f7d9..3b5ccc587ad15 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1321,17 +1321,24 @@ struct server_slot { && are_lora_equal(lora, other_slot.lora); } + // There are two caps on the budget of a single request: + // * [params.n_predict] + // * [global_params.n_predict] + // This function returns true if the request is not limited by either of them. bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } + n_remaining = INT32_MAX; - n_remaining = -1; + // The request or server have finite limits on the number of tokens to generate. + if ((params.n_predict != -1 && params.n_predict != -2) || (global_params.n_predict != -1 && global_params.n_predict != -2)) { + n_remaining = std::min(n_remaining, params.n_predict - n_decoded); + } - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; + // The request or server have limits based on the context window. 
+ if (params.n_predict == -2 || global_params.n_predict == -2) { + n_remaining = std::min(n_remaining, n_ctx - n_decoded); } return n_remaining > 0; // no budget diff --git a/test.sh b/test.sh new file mode 100755 index 0000000000000..0f276fa739be2 --- /dev/null +++ b/test.sh @@ -0,0 +1,12 @@ +curl --location 'http://localhost:8080/v1/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer no-key' \ +--data '{ +"messages": [ +{ +"role": "user", +"content": "Count from 1 to 4097 one at a time, separating each number with a newline. You should not abbreviate the numbers, but list out every single one." +} +], +"n_predict": -2 +}' From f3fdca7ee67fa106a243db9a12f62270c6661796 Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Mon, 10 Mar 2025 18:37:54 -0400 Subject: [PATCH 2/6] Remove test.sh --- test.sh | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100755 test.sh diff --git a/test.sh b/test.sh deleted file mode 100755 index 0f276fa739be2..0000000000000 --- a/test.sh +++ /dev/null @@ -1,12 +0,0 @@ -curl --location 'http://localhost:8080/v1/chat/completions' \ ---header 'Content-Type: application/json' \ ---header 'Authorization: Bearer no-key' \ ---data '{ -"messages": [ -{ -"role": "user", -"content": "Count from 1 to 4097 one at a time, separating each number with a newline. You should not abbreviate the numbers, but list out every single one." 
-} -], -"n_predict": -2 -}' From 7199eb9dede658773ba4d941068dfc1b71b5fd54 Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Tue, 11 Mar 2025 10:31:27 -0400 Subject: [PATCH 3/6] Add test that when n_predict=-2 predicted_n==n_ctx --- examples/server/tests/unit/test_completion.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 0ed5b99bef4e4..8568ea4e26974 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -426,3 +426,18 @@ def test_cancel_request(): time.sleep(1) # wait for HTTP_POLLING_SECONDS res = server.make_request("GET", "/slots") assert res.body[0]["is_processing"] == False + + +def test_context_window_sized_completion(): + global server + server.n_ctx = 16 + server.n_predict = -1 + server.start() + res = server.make_request("POST", "/completion", data={ + "n_predict": -2, + "prompt": "The 50 states in the US are ", + }) + assert res.status_code == 200 + assert res.body["timings"]["predicted_n"] == server.n_ctx + assert res.body["stop_type"] == "limit" + assert type(res.body["has_new_line"]) == bool From 8511ec549b9b60c19cdf2069427c888cc71ca6a1 Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Tue, 11 Mar 2025 16:25:21 -0400 Subject: [PATCH 4/6] Improve readability --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3b5ccc587ad15..1b4961d425122 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1331,8 +1331,8 @@ struct server_slot { } n_remaining = INT32_MAX; - // The request or server have finite limits on the number of tokens to generate. 
- if ((params.n_predict != -1 && params.n_predict != -2) || (global_params.n_predict != -1 && global_params.n_predict != -2)) { + // The request or server have specified limits on the number of tokens to generate. + if ((params.n_predict >= 0) || (global_params.n_predict >= 0)) { n_remaining = std::min(n_remaining, params.n_predict - n_decoded); } From ff419295a0c3fbbd823086db7f6823cf23a4fa3f Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Wed, 12 Mar 2025 09:13:45 -0400 Subject: [PATCH 5/6] Unlimit the n_predict in slots --- examples/server/tests/unit/test_completion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 8568ea4e26974..b66c4a993ede3 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -143,6 +143,7 @@ def test_consistent_result_same_seed(n_slots: int): def test_different_result_different_seed(n_slots: int): global server server.n_slots = n_slots + server.n_predict = -1 server.start() last_res = None for seed in range(4): @@ -150,6 +151,7 @@ def test_different_result_different_seed(n_slots: int): "prompt": "I believe the meaning of life is", "seed": seed, "temperature": 1.0, + "n_predict": -1, "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed }) if last_res is not None: From f94e1059b2af56bb609888ecaad19e67159663fb Mon Sep 17 00:00:00 2001 From: ishaangandhi Date: Wed, 12 Mar 2025 13:47:35 -0400 Subject: [PATCH 6/6] Create new server settings object for test --- examples/server/tests/unit/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index b66c4a993ede3..532c08597c7ab 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -431,7 +431,7 @@ def test_cancel_request(): 
def test_context_window_sized_completion(): - global server + server = ServerPreset.tinyllama2() server.n_ctx = 16 server.n_predict = -1 server.start()