Skip to content

Commit a40ba49

Browse files
committed
Merge branch 'master' into gg/llama-kv-cache
2 parents c30e34c + 3d804de commit a40ba49

26 files changed

+1920
-191
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,8 @@ jobs:
346346
id: cmake_test
347347
run: |
348348
cd build
349-
ctest -L main --verbose --timeout 900
349+
# This is using llvmpipe and runs slower than other backends
350+
ctest -L main --verbose --timeout 1800
350351
351352
ubuntu-22-cmake-hip:
352353
runs-on: ubuntu-22.04

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ To learn more about model quantization, [read this documentation](examples/quant
422422

423423
</details>
424424

425-
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
425+
[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
426426
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
427427

428428
## [`llama-bench`](examples/llama-bench)

common/chat-template.hpp

Lines changed: 175 additions & 77 deletions
Large diffs are not rendered by default.

common/minja.hpp

Lines changed: 61 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ class Context : public std::enable_shared_from_this<Context> {
628628
if (parent_) return parent_->contains(key);
629629
return false;
630630
}
631-
virtual void set(const Value & key, Value & value) {
631+
virtual void set(const Value & key, const Value & value) {
632632
values_.set(key, value);
633633
}
634634
};
@@ -2648,31 +2648,34 @@ inline std::shared_ptr<Context> Context::builtins() {
26482648
return filter.call(context, actual_args);
26492649
});
26502650
};
2651-
// https://jinja.palletsprojects.com/en/3.0.x/templates/#jinja-filters.reject
2652-
globals.set("reject", Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
2653-
args.expectArgs("reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
2654-
auto & items = args.args[0];
2655-
auto filter_fn = context->get(args.args[1]);
2656-
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
2651+
auto select_or_reject = [make_filter](bool is_select) {
2652+
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
2653+
args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
2654+
auto & items = args.args[0];
2655+
auto filter_fn = context->get(args.args[1]);
2656+
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
26572657

2658-
auto filter_args = Value::array();
2659-
for (size_t i = 2, n = args.args.size(); i < n; i++) {
2660-
filter_args.push_back(args.args[i]);
2661-
}
2662-
auto filter = make_filter(filter_fn, filter_args);
2658+
auto filter_args = Value::array();
2659+
for (size_t i = 2, n = args.args.size(); i < n; i++) {
2660+
filter_args.push_back(args.args[i]);
2661+
}
2662+
auto filter = make_filter(filter_fn, filter_args);
26632663

2664-
auto res = Value::array();
2665-
for (size_t i = 0, n = items.size(); i < n; i++) {
2666-
auto & item = items.at(i);
2667-
ArgumentsValue filter_args;
2668-
filter_args.args.emplace_back(item);
2669-
auto pred_res = filter.call(context, filter_args);
2670-
if (!pred_res.to_bool()) {
2671-
res.push_back(item);
2664+
auto res = Value::array();
2665+
for (size_t i = 0, n = items.size(); i < n; i++) {
2666+
auto & item = items.at(i);
2667+
ArgumentsValue filter_args;
2668+
filter_args.args.emplace_back(item);
2669+
auto pred_res = filter.call(context, filter_args);
2670+
if (pred_res.to_bool() == (is_select ? true : false)) {
2671+
res.push_back(item);
2672+
}
26722673
}
2673-
}
2674-
return res;
2675-
}));
2674+
return res;
2675+
});
2676+
};
2677+
globals.set("select", select_or_reject(/* is_select= */ true));
2678+
globals.set("reject", select_or_reject(/* is_select= */ false));
26762679
globals.set("map", Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
26772680
auto res = Value::array();
26782681
if (args.args.size() == 1 &&
@@ -2720,41 +2723,45 @@ inline std::shared_ptr<Context> Context::builtins() {
27202723
if (!text.empty() && text.back() == '\n') out += "\n";
27212724
return out;
27222725
}));
2723-
globals.set("selectattr", Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
2724-
args.expectArgs("selectattr", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
2725-
auto & items = args.args[0];
2726-
if (items.is_null())
2727-
return Value::array();
2728-
auto attr_name = args.args[1].get<std::string>();
2729-
2730-
bool has_test = false;
2731-
Value test_fn;
2732-
ArgumentsValue test_args {{Value()}, {}};
2733-
if (args.args.size() >= 3) {
2734-
has_test = true;
2735-
test_fn = context->get(args.args[2]);
2736-
if (test_fn.is_null()) throw std::runtime_error("Undefined test: " + args.args[2].dump());
2737-
for (size_t i = 3, n = args.args.size(); i < n; i++) {
2738-
test_args.args.emplace_back(args.args[i]);
2726+
auto select_or_reject_attr = [](bool is_select) {
2727+
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
2728+
args.expectArgs(is_select ? "selectattr" : "rejectattr", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
2729+
auto & items = args.args[0];
2730+
if (items.is_null())
2731+
return Value::array();
2732+
auto attr_name = args.args[1].get<std::string>();
2733+
2734+
bool has_test = false;
2735+
Value test_fn;
2736+
ArgumentsValue test_args {{Value()}, {}};
2737+
if (args.args.size() >= 3) {
2738+
has_test = true;
2739+
test_fn = context->get(args.args[2]);
2740+
if (test_fn.is_null()) throw std::runtime_error("Undefined test: " + args.args[2].dump());
2741+
for (size_t i = 3, n = args.args.size(); i < n; i++) {
2742+
test_args.args.emplace_back(args.args[i]);
2743+
}
2744+
test_args.kwargs = args.kwargs;
27392745
}
2740-
test_args.kwargs = args.kwargs;
2741-
}
27422746

2743-
auto res = Value::array();
2744-
for (size_t i = 0, n = items.size(); i < n; i++) {
2745-
auto & item = items.at(i);
2746-
auto attr = item.get(attr_name);
2747-
if (has_test) {
2748-
test_args.args[0] = attr;
2749-
if (test_fn.call(context, test_args).to_bool()) {
2750-
res.push_back(item);
2747+
auto res = Value::array();
2748+
for (size_t i = 0, n = items.size(); i < n; i++) {
2749+
auto & item = items.at(i);
2750+
auto attr = item.get(attr_name);
2751+
if (has_test) {
2752+
test_args.args[0] = attr;
2753+
if (test_fn.call(context, test_args).to_bool() == (is_select ? true : false)) {
2754+
res.push_back(item);
2755+
}
2756+
} else {
2757+
res.push_back(attr);
27512758
}
2752-
} else {
2753-
res.push_back(attr);
27542759
}
2755-
}
2756-
return res;
2757-
}));
2760+
return res;
2761+
});
2762+
};
2763+
globals.set("selectattr", select_or_reject_attr(/* is_select= */ true));
2764+
globals.set("rejectattr", select_or_reject_attr(/* is_select= */ false));
27582765
globals.set("range", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
27592766
std::vector<int64_t> startEndStep(3);
27602767
std::vector<bool> param_set(3);

examples/server/README.md

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,13 @@ npm i
236236
# to run the dev server
237237
npm run dev
238238
239-
# to build the public/index.html
239+
# to build the public/index.html.gz
240240
npm run build
241241
```
242+
After `public/index.html.gz` has been generated we need to generate the C++
243+
headers (like build/examples/server/index.html.gz.hpp) that will be included
244+
by server.cpp. This is done by building `llama-server` as described in the
245+
[build](#build) section above.
242246

243247
NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
244248

@@ -456,7 +460,7 @@ These words will not be included in the completion, so make sure to add them to
456460
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
457461

458462
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
459-
```json
463+
```
460464
{
461465
"content": "<the generated completion text>",
462466
"tokens": [ generated token ids if requested ],
@@ -557,7 +561,7 @@ If `with_pieces` is `true`:
557561
```
558562

559563
With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
560-
```json
564+
```
561565
{
562566
"tokens": [
563567
{"id": 198, "piece": [195]}, // hex C3
@@ -572,6 +576,18 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
572576

573577
`tokens`: Set the tokens to detokenize.
574578

579+
### POST `/apply-template`: Apply chat template to a conversation
580+
581+
Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
582+
583+
*Options:*
584+
585+
`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
586+
587+
**Response format**
588+
589+
Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format.
590+
575591
### POST `/embedding`: Generate embedding of a given text
576592

577593
> [!IMPORTANT]
@@ -764,7 +780,7 @@ Same as the `/v1/embeddings` endpoint.
764780

765781
**Response format**
766782

767-
```json
783+
```
768784
[
769785
{
770786
"index": 0,

examples/server/server.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
// mime type for sending response
1515
#define MIMETYPE_JSON "application/json; charset=utf-8"
1616

17-
// auto generated files (update with ./deps.sh)
17+
// auto generated files (see README.md for details)
1818
#include "index.html.gz.hpp"
1919
#include "loading.html.hpp"
2020

@@ -4124,6 +4124,14 @@ int main(int argc, char ** argv) {
41244124
res_ok(res, root);
41254125
};
41264126

4127+
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
4128+
auto body = json::parse(req.body);
4129+
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
4130+
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
4131+
4132+
res_ok(res, {{ "prompt", data.at("prompt") }});
4133+
};
4134+
41274135
const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
41284136
handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
41294137
};
@@ -4300,6 +4308,7 @@ int main(int argc, char ** argv) {
43004308
svr->Post("/v1/reranking", handle_rerank);
43014309
svr->Post("/tokenize", handle_tokenize);
43024310
svr->Post("/detokenize", handle_detokenize);
4311+
svr->Post("/apply-template", handle_apply_template);
43034312
// LoRA adapters hotswap
43044313
svr->Get ("/lora-adapters", handle_lora_adapters_list);
43054314
svr->Post("/lora-adapters", handle_lora_adapters_apply);
@@ -4378,11 +4387,13 @@ int main(int argc, char ** argv) {
43784387
ctx_server.chat_templates.template_default->source().c_str(),
43794388
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
43804389

4381-
ctx_server.queue_tasks.on_new_task(std::bind(
4382-
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
4390+
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
4391+
ctx_server.process_single_task(task);
4392+
});
43834393

4384-
ctx_server.queue_tasks.on_update_slots(std::bind(
4385-
&server_context::update_slots, &ctx_server));
4394+
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
4395+
ctx_server.update_slots();
4396+
});
43864397

43874398
shutdown_handler = [&](int) {
43884399
ctx_server.queue_tasks.terminate();

examples/server/tests/unit/test_chat_completion.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,21 @@ def test_chat_template():
121121
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
122122

123123

124+
def test_apply_chat_template():
125+
global server
126+
server.chat_template = "command-r"
127+
server.start()
128+
res = server.make_request("POST", "/apply-template", data={
129+
"messages": [
130+
{"role": "system", "content": "You are a test."},
131+
{"role": "user", "content":"Hi there"},
132+
]
133+
})
134+
assert res.status_code == 200
135+
assert "prompt" in res.body
136+
assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
137+
138+
124139
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
125140
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
126141
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),

0 commit comments

Comments (0)