
Commit c52140d

Merge pull request #285 from menloresearch/update-dev-from-master-2025-10-11-00-31
Sync master with upstream release b6730
2 parents: 7034082 + e60f01d

File tree

8 files changed (+55, -7 lines)


ggml/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions

@@ -145,6 +145,9 @@ endif()
 # which was introduced in POSIX.1-2008, forcing us to go higher
 if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
     add_compile_definitions(_XOPEN_SOURCE=700)
+elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
+    # in order to define _SC_PHYS_PAGES.
 else()
     add_compile_definitions(_XOPEN_SOURCE=600)
 endif()
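
Context for the AIX change: per the comment in the diff, ggml reads physical memory via sysconf(_SC_PHYS_PAGES), and on AIX that name is only visible under _ALL_SOURCE (the default), which defining _XOPEN_SOURCE would suppress. A minimal sketch of the same sysconf query, in Python for brevity (the feature guard is an assumption, since not every platform exposes these names):

import os

# Total RAM = page count * page size, the same quantities the C code
# obtains via sysconf(_SC_PHYS_PAGES) / sysconf(_SC_PAGE_SIZE).
if "SC_PHYS_PAGES" in os.sysconf_names and "SC_PAGE_SIZE" in os.sysconf_names:
    pages = os.sysconf("SC_PHYS_PAGES")
    page_size = os.sysconf("SC_PAGE_SIZE")
    print(f"physical memory: {pages * page_size / 1024**3:.1f} GiB")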

src/llama-sampling.cpp

Lines changed: 5 additions & 0 deletions

@@ -2541,8 +2541,13 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     if (n_non_eog == 0) {
         cur_p->size = 1;
         cur_p->data[0].id = ctx->vocab->token_eot();
+        if (cur_p->data[0].id == LLAMA_TOKEN_NULL) {
+            cur_p->data[0].id = ctx->vocab->token_eos();
+        }
         cur_p->data[0].logit = 1.0f;

+        GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL);
+
         return;
     }
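
The sampler fix handles vocabularies that define no EOT token by falling back to EOS, then asserting that some end-of-generation token exists. The same fallback chain, sketched in Python (TOKEN_NULL and the vocab accessors are stand-ins for the C++ identifiers LLAMA_TOKEN_NULL, token_eot(), token_eos()):

TOKEN_NULL = -1  # stand-in for LLAMA_TOKEN_NULL

def pick_end_token(vocab):
    # Prefer end-of-turn; some vocabularies only define end-of-sequence.
    tok = vocab.token_eot()
    if tok == TOKEN_NULL:
        tok = vocab.token_eos()
    # Mirrors the GGML_ASSERT: generation must be able to terminate.
    assert tok != TOKEN_NULL, "vocab defines neither EOT nor EOS"
    return tok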

src/llama-vocab.cpp

Lines changed: 1 addition & 0 deletions

@@ -2171,6 +2171,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|end|>"
             || t.first == "<end_of_turn>"
             || t.first == "<|endoftext|>"
+            || t.first == "<|end_of_text|>" // granite
             || t.first == "<EOT>"
             || t.first == "_<EOT>"
             || t.first == "<|end▁of▁sentence|>" // DeepSeek

tools/server/public/index.html.gz

-8 Bytes
Binary file not shown.

tools/server/server.cpp

Lines changed: 12 additions & 4 deletions

@@ -3727,7 +3727,7 @@ struct server_context {
             }
         } else {
             if (slot.n_prompt_tokens() >= slot.n_ctx) {
-                send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
                 slot.release();
                 continue;
             }

@@ -4226,7 +4226,7 @@ struct server_context {
             metrics.on_prompt_eval(slot);
         }

-        slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
+        slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;

        completion_token_output result;
        result.tok = id;
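
The clamp in the second hunk prevents a zero elapsed time when a token lands on the same timer tick as t_start_generation, presumably to keep downstream rate calculations finite. A small sketch of the guarded computation (the microseconds-in, milliseconds-out units are inferred from the /1e3 in the diff):

def token_generation_ms(t_current_us: int, t_start_us: int) -> float:
    # Clamp to 1 us so rate math never sees a zero duration.
    return max(1, t_current_us - t_start_us) / 1e3

n_decoded = 42
t_ms = token_generation_ms(1_000_000, 1_000_000)  # identical timestamps
print(n_decoded / t_ms * 1e3)  # tokens/sec stays finite
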
@@ -4368,7 +4368,7 @@ struct server_context {

 static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
     // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions") {
+    if (req.path == "/v1/health") {
         return;
     }

@@ -4955,9 +4955,17 @@ int main(int argc, char ** argv) {
             // Everything else, including multimodal completions.
             inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         }
-
+        const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel;
         tasks.reserve(inputs.size());
         for (size_t i = 0; i < inputs.size(); i++) {
+            auto n_prompt_tokens = inputs[i].size();
+            if (n_prompt_tokens >= n_ctx_slot) {
+                json error_data = format_error_response("the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                error_data["n_prompt_tokens"] = n_prompt_tokens;
+                error_data["n_ctx"] = n_ctx_slot;
+                res_error(res, error_data);
+                return;
+            }
             server_task task = server_task(type);

             task.id = ctx_server.queue_tasks.get_new_id();
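
With the last hunk, an oversized prompt is rejected before any task is queued, and the structured error carries both the prompt size and the per-slot context budget (n_ctx divided by n_parallel). A sketch of what a client now sees, assuming a llama-server listening on localhost:8080 (URL and prompt are illustrative; the field names come from the diff above):

import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "book " * 100_000}]},
)
if resp.status_code == 400:
    err = resp.json()["error"]
    print(err["type"])             # "exceed_context_size_error"
    print(err["n_prompt_tokens"])  # tokenized prompt length
    print(err["n_ctx"])            # context available to a single slot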

tools/server/tests/unit/test_chat_completion.py

Lines changed: 22 additions & 0 deletions

@@ -408,6 +408,28 @@ def test_context_size_exceeded():
     assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots


+def test_context_size_exceeded_stream():
+    global server
+    server.start()
+    try:
+        for _ in server.make_stream_request("POST", "/chat/completions", data={
+            "messages": [
+                {"role": "system", "content": "Book"},
+                {"role": "user", "content": "What is the best book"},
+            ] * 100,  # make the prompt too long
+            "stream": True}):
+            pass
+        assert False, "Should have failed"
+    except ServerError as e:
+        assert e.code == 400
+        assert "error" in e.body
+        assert e.body["error"]["type"] == "exceed_context_size_error"
+        assert e.body["error"]["n_prompt_tokens"] > 0
+        assert server.n_ctx is not None
+        assert server.n_slots is not None
+        assert e.body["error"]["n_ctx"] == server.n_ctx // server.n_slots
+
+
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [

tools/server/tests/utils.py

Lines changed: 8 additions & 0 deletions

@@ -35,6 +35,12 @@ class ServerResponse:
     body: dict | Any


+class ServerError(Exception):
+    def __init__(self, code, body):
+        self.code = code
+        self.body = body
+
+
 class ServerProcess:
     # default options
     debug: bool = False

@@ -297,6 +303,8 @@ def make_stream_request(
             response = requests.post(url, headers=headers, json=data, stream=True)
         else:
             raise ValueError(f"Unimplemented method: {method}")
+        if response.status_code != 200:
+            raise ServerError(response.status_code, response.json())
         for line_bytes in response.iter_lines():
             line = line_bytes.decode("utf-8")
             if '[DONE]' in line:
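
Because the status check runs before iteration begins, a failed streaming request now raises immediately instead of yielding an empty stream. A hypothetical caller (handle_chunk is a placeholder, not part of the test utilities):

try:
    for chunk in server.make_stream_request("POST", "/chat/completions", data={
        "messages": [{"role": "user", "content": "hi"}],
        "stream": True,
    }):
        handle_chunk(chunk)  # placeholder for real chunk handling
except ServerError as e:
    # Non-200 responses surface here with the parsed JSON body.
    print(e.code, e.body["error"]["type"])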

tools/server/webui/src/lib/services/chat.ts

Lines changed: 4 additions & 3 deletions

@@ -122,9 +122,10 @@ export class ChatService {
         requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';

         if (temperature !== undefined) requestBody.temperature = temperature;
-        // Set max_tokens to -1 (infinite) if not provided or empty
-        requestBody.max_tokens =
-            max_tokens !== undefined && max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+        if (max_tokens !== undefined) {
+            // Set max_tokens to -1 (infinite) when explicitly configured as 0 or null
+            requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+        }

         if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
         if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
