
Commit 8ef37a3

Author: Olivier Chafik

Merge remote-tracking branch 'origin/master' into tool-call

2 parents 9591af1 + 3d804de, commit 8ef37a3

File tree: 5 files changed (+47, -9 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -423,7 +423,7 @@ To learn more about model quantization, [read this documentation](examples/quant

 </details>

-[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

 ## [`llama-bench`](examples/llama-bench)

examples/server/README.md

Lines changed: 15 additions & 3 deletions
@@ -460,7 +460,7 @@ These words will not be included in the completion, so make sure to add them to
 - Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
-```json
+```
 {
   "content": "<the generated completion text>",
   "tokens": [ generated token ids if requested ],
@@ -561,7 +561,7 @@ If `with_pieces` is `true`:
 ```

 With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
-```json
+```
 {
   "tokens": [
     {"id": 198, "piece": [195]}, // hex C3
@@ -576,6 +576,18 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k

 `tokens`: Set the tokens to detokenize.

+### POST `/apply-template`: Apply chat template to a conversation
+
+Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
+
+*Options:*
+
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
+
+**Response format**
+
+Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format.
+
 ### POST `/embedding`: Generate embedding of a given text

 > [!IMPORTANT]
@@ -768,7 +780,7 @@ Same as the `/v1/embeddings` endpoint.

 **Response format**

-```json
+```
 [
   {
     "index": 0,

examples/server/server.cpp

Lines changed: 15 additions & 4 deletions
@@ -4185,6 +4185,14 @@ int main(int argc, char ** argv) {
         res_ok(res, root);
     };

+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
+        res_ok(res, {{ "prompt", data.at("prompt") }});
+    };
+
     const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
         handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
     };
@@ -4361,6 +4369,7 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/reranking", handle_rerank);
     svr->Post("/tokenize", handle_tokenize);
     svr->Post("/detokenize", handle_detokenize);
+    svr->Post("/apply-template", handle_apply_template);
     // LoRA adapters hotswap
     svr->Get ("/lora-adapters", handle_lora_adapters_list);
     svr->Post("/lora-adapters", handle_lora_adapters_apply);
@@ -4439,11 +4448,13 @@ int main(int argc, char ** argv) {
         ctx_server.chat_templates.template_default->source().c_str(),
         common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

-    ctx_server.queue_tasks.on_new_task(std::bind(
-        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
+        ctx_server.process_single_task(task);
+    });

-    ctx_server.queue_tasks.on_update_slots(std::bind(
-        &server_context::update_slots, &ctx_server));
+    ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
+        ctx_server.update_slots();
+    });

     shutdown_handler = [&](int) {
         ctx_server.queue_tasks.terminate();
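
As the ternary in `handle_apply_template` shows, the handler switches to the model's tool-use chat template when the request body contains a `tools` field and the model provides such a template; otherwise it falls back to the default template. A hedged sketch of such a request (tool schema in the OpenAI style accepted by `/v1/chat/completions`; whether the tool-use template is actually applied depends on the loaded model and on server flags such as `--jinja`):

```python
import requests

# Hypothetical tool definition; the field layout follows the OpenAI-style
# "tools" format used by the chat-completions endpoint.
res = requests.post("http://localhost:8080/apply-template", json={
    "messages": [
        {"role": "user", "content": "What is the weather in Paris?"},
    ],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
})
# Prompt rendered with the tool-use template, if the model provides one.
print(res.json()["prompt"])
```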

examples/server/tests/unit/test_chat_completion.py

Lines changed: 15 additions & 0 deletions
@@ -121,6 +121,21 @@ def test_chat_template():
     assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"


+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content":"Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+
+
 @pytest.mark.parametrize("response_format,n_predicted,re_content", [
     ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
     ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             linefeed_id = ids[0];
         } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);

             //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             if (ids.empty()) {
