
Commit c52140d

Merge pull request #285 from menloresearch/update-dev-from-master-2025-10-11-00-31
Sync master with upstream release b6730
2 parents: 7034082 + e60f01d

File tree

8 files changed (+55, -7 lines)


ggml/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions

@@ -145,6 +145,9 @@ endif()
 # which was introduced in POSIX.1-2008, forcing us to go higher
 if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
     add_compile_definitions(_XOPEN_SOURCE=700)
+elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
+    # in order to define _SC_PHYS_PAGES.
 else()
     add_compile_definitions(_XOPEN_SOURCE=600)
 endif()
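
Context for the AIX change: per the comment in the diff, ggml reads physical memory via sysconf(_SC_PHYS_PAGES), and on AIX that name is only visible under _ALL_SOURCE (the default), which defining _XOPEN_SOURCE would suppress. A minimal sketch of the same sysconf query, in Python for brevity (the feature guard is an assumption, since not every platform exposes these names):

import os

# Total RAM = page count * page size, the same quantities the C code
# obtains via sysconf(_SC_PHYS_PAGES) / sysconf(_SC_PAGE_SIZE).
if "SC_PHYS_PAGES" in os.sysconf_names and "SC_PAGE_SIZE" in os.sysconf_names:
    pages = os.sysconf("SC_PHYS_PAGES")
    page_size = os.sysconf("SC_PAGE_SIZE")
    print(f"physical memory: {pages * page_size / 1024**3:.1f} GiB")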

src/llama-sampling.cpp

Lines changed: 5 additions & 0 deletions

@@ -2541,8 +2541,13 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     if (n_non_eog == 0) {
         cur_p->size = 1;
         cur_p->data[0].id = ctx->vocab->token_eot();
+        if (cur_p->data[0].id == LLAMA_TOKEN_NULL) {
+            cur_p->data[0].id = ctx->vocab->token_eos();
+        }
         cur_p->data[0].logit = 1.0f;

+        GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL);
+
         return;
     }
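
The sampler fix handles vocabularies that define no EOT token by falling back to EOS, then asserting that some end-of-generation token exists. The same fallback chain, sketched in Python (TOKEN_NULL and the vocab accessors are stand-ins for the C++ identifiers LLAMA_TOKEN_NULL, token_eot(), token_eos()):

TOKEN_NULL = -1  # stand-in for LLAMA_TOKEN_NULL

def pick_end_token(vocab):
    # Prefer end-of-turn; some vocabularies only define end-of-sequence.
    tok = vocab.token_eot()
    if tok == TOKEN_NULL:
        tok = vocab.token_eos()
    # Mirrors the GGML_ASSERT: generation must be able to terminate.
    assert tok != TOKEN_NULL, "vocab defines neither EOT nor EOS"
    return tok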

src/llama-vocab.cpp

Lines changed: 1 addition & 0 deletions

@@ -2171,6 +2171,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|end|>"
             || t.first == "<end_of_turn>"
             || t.first == "<|endoftext|>"
+            || t.first == "<|end_of_text|>" // granite
             || t.first == "<EOT>"
             || t.first == "_<EOT>"
             || t.first == "<|end▁of▁sentence|>" // DeepSeek

tools/server/public/index.html.gz

-8 Bytes
Binary file not shown.

tools/server/server.cpp

Lines changed: 12 additions & 4 deletions

@@ -3727,7 +3727,7 @@ struct server_context {
             }
         } else {
             if (slot.n_prompt_tokens() >= slot.n_ctx) {
-                send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
                 slot.release();
                 continue;
             }

@@ -4226,7 +4226,7 @@ struct server_context {
             metrics.on_prompt_eval(slot);
         }

-        slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
+        slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;

        completion_token_output result;
        result.tok = id;
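
The clamp in the second hunk prevents a zero elapsed time when a token lands on the same timer tick as t_start_generation, presumably to keep downstream rate calculations finite. A small sketch of the guarded computation (the microseconds-in, milliseconds-out units are inferred from the /1e3 in the diff):

def token_generation_ms(t_current_us: int, t_start_us: int) -> float:
    # Clamp to 1 us so rate math never sees a zero duration.
    return max(1, t_current_us - t_start_us) / 1e3

n_decoded = 42
t_ms = token_generation_ms(1_000_000, 1_000_000)  # identical timestamps
print(n_decoded / t_ms * 1e3)  # tokens/sec stays finite
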
@@ -4368,7 +4368,7 @@ struct server_context {

 static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
     // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions") {
+    if (req.path == "/v1/health") {
         return;
     }

@@ -4955,9 +4955,17 @@ int main(int argc, char ** argv) {
             // Everything else, including multimodal completions.
             inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         }
-
+        const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel;
         tasks.reserve(inputs.size());
         for (size_t i = 0; i < inputs.size(); i++) {
+            auto n_prompt_tokens = inputs[i].size();
+            if (n_prompt_tokens >= n_ctx_slot) {
+                json error_data = format_error_response("the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                error_data["n_prompt_tokens"] = n_prompt_tokens;
+                error_data["n_ctx"] = n_ctx_slot;
+                res_error(res, error_data);
+                return;
+            }
             server_task task = server_task(type);

             task.id = ctx_server.queue_tasks.get_new_id();
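
With the last hunk, an oversized prompt is rejected before any task is queued, and the structured error carries both the prompt size and the per-slot context budget (n_ctx divided by n_parallel). A sketch of what a client now sees, assuming a llama-server listening on localhost:8080 (URL and prompt are illustrative; the field names come from the diff above):

import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "book " * 100_000}]},
)
if resp.status_code == 400:
    err = resp.json()["error"]
    print(err["type"])             # "exceed_context_size_error"
    print(err["n_prompt_tokens"])  # tokenized prompt length
    print(err["n_ctx"])            # context available to a single slot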

tools/server/tests/unit/test_chat_completion.py

Lines changed: 22 additions & 0 deletions

@@ -408,6 +408,28 @@ def test_context_size_exceeded():
     assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots


+def test_context_size_exceeded_stream():
+    global server
+    server.start()
+    try:
+        for _ in server.make_stream_request("POST", "/chat/completions", data={
+            "messages": [
+                {"role": "system", "content": "Book"},
+                {"role": "user", "content": "What is the best book"},
+            ] * 100,  # make the prompt too long
+            "stream": True}):
+            pass
+        assert False, "Should have failed"
+    except ServerError as e:
+        assert e.code == 400
+        assert "error" in e.body
+        assert e.body["error"]["type"] == "exceed_context_size_error"
+        assert e.body["error"]["n_prompt_tokens"] > 0
+        assert server.n_ctx is not None
+        assert server.n_slots is not None
+        assert e.body["error"]["n_ctx"] == server.n_ctx // server.n_slots
+
+
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [

tools/server/tests/utils.py

Lines changed: 8 additions & 0 deletions

@@ -35,6 +35,12 @@ class ServerResponse:
     body: dict | Any


+class ServerError(Exception):
+    def __init__(self, code, body):
+        self.code = code
+        self.body = body
+
+
 class ServerProcess:
     # default options
     debug: bool = False

@@ -297,6 +303,8 @@ def make_stream_request(
             response = requests.post(url, headers=headers, json=data, stream=True)
         else:
             raise ValueError(f"Unimplemented method: {method}")
+        if response.status_code != 200:
+            raise ServerError(response.status_code, response.json())
         for line_bytes in response.iter_lines():
             line = line_bytes.decode("utf-8")
             if '[DONE]' in line:
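
Because the status check runs before iteration begins, a failed streaming request now raises immediately instead of yielding an empty stream. A hypothetical caller (handle_chunk is a placeholder, not part of the test utilities):

try:
    for chunk in server.make_stream_request("POST", "/chat/completions", data={
        "messages": [{"role": "user", "content": "hi"}],
        "stream": True,
    }):
        handle_chunk(chunk)  # placeholder for real chunk handling
except ServerError as e:
    # Non-200 responses surface here with the parsed JSON body.
    print(e.code, e.body["error"]["type"])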

tools/server/webui/src/lib/services/chat.ts

Lines changed: 4 additions & 3 deletions

@@ -122,9 +122,10 @@ export class ChatService {
         requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';

         if (temperature !== undefined) requestBody.temperature = temperature;
-        // Set max_tokens to -1 (infinite) if not provided or empty
-        requestBody.max_tokens =
-            max_tokens !== undefined && max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+        if (max_tokens !== undefined) {
+            // Set max_tokens to -1 (infinite) when explicitly configured as 0 or null
+            requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+        }

         if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
         if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
