Skip to content

Commit 40cc3f2

Browse files
author
ochafik
committed
Merge branch 'tool-call' of github.com:ochafik/llama.cpp into tool-call
2 parents 4a1e8e9 + 384f54a commit 40cc3f2

File tree

7 files changed

+86
-69
lines changed

7 files changed

+86
-69
lines changed

examples/server/server.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,6 @@ struct server_task_result_cmpl_partial : server_task_result {
768768
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
769769
std::string oaicompat_model;
770770
std::string oaicompat_cmpl_id;
771-
std::shared_ptr<common_chat_parser> chat_parser;
772771

773772
virtual int get_index() override {
774773
return index;
@@ -1191,7 +1190,6 @@ struct server_slot {
11911190

11921191
std::string stopping_word;
11931192

1194-
std::shared_ptr<common_chat_parser> chat_parser;
11951193

11961194
// sampling
11971195
json json_schema;
@@ -1200,6 +1198,8 @@ struct server_slot {
12001198

12011199
llama_token sampled;
12021200

1201+
common_chat_parser chat_parser;
1202+
12031203
// stats
12041204
size_t n_sent_text = 0; // number of sent text character
12051205

@@ -3998,8 +3998,6 @@ int main(int argc, char ** argv) {
39983998

39993999
auto body = json::parse(req.body);
40004000
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
4001-
LOG_INF("Request: %s\n", body.dump(2).c_str());
4002-
40034001
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
40044002

40054003
return handle_completions_impl(

examples/server/tests/unit/test_tool_call.py

Lines changed: 64 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -61,28 +61,7 @@ def create_server():
6161
}
6262

6363

64-
@pytest.mark.parametrize("template_name,tool,argument_key", [
65-
("meta-llama-Meta-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
66-
("meta-llama-Meta-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
67-
("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
68-
("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
69-
("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
70-
("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
71-
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
72-
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
73-
("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
74-
("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
75-
("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
76-
("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
77-
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
78-
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
79-
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
80-
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
81-
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
82-
("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
83-
# TODO: fix these
84-
])
85-
def test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
64+
def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
8665
n_predict = 512
8766
global server
8867
# server = ServerPreset.stories15m_moe()
@@ -117,6 +96,40 @@ def test_completion_with_required_tool_tiny(template_name: str, tool: dict, argu
11796
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
11897

11998

99+
@pytest.mark.parametrize("template_name,tool,argument_key", [
100+
("google-gemma-2-2b-it", TEST_TOOL, "success"),
101+
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
102+
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
103+
])
104+
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
105+
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
106+
107+
108+
@pytest.mark.slow
109+
@pytest.mark.parametrize("template_name,tool,argument_key", [
110+
("meta-llama-Meta-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
111+
("meta-llama-Meta-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
112+
("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
113+
("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
114+
("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
115+
("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
116+
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
117+
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
118+
("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
119+
("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
120+
("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
121+
("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
122+
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
123+
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
124+
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
125+
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
126+
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
127+
("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
128+
])
129+
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
130+
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
131+
132+
120133
@pytest.mark.slow
121134
@pytest.mark.parametrize("tool,argument_key,hf_repo,hf_file,template_override", [
122135
(TEST_TOOL, "success", "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", None),
@@ -154,7 +167,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
154167
if template_override:
155168
(template_hf_repo, template_variant) = template_override
156169
server.chat_template_file = f"../../../tests/chat/templates/{template_hf_repo.replace('/', '') + ('-' + template_variant if template_variant else '')}.jinja"
157-
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_hf_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
170+
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
158171
server.start()
159172
res = server.make_request("POST", "/chat/completions", data={
160173
"max_tokens": n_predict,
@@ -183,18 +196,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
183196
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
184197

185198

186-
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
187-
("meetkai-functionary-medium-v3.1", 128, [], None),
188-
("meetkai-functionary-medium-v3.1", 128, [TEST_TOOL], None),
189-
("meetkai-functionary-medium-v3.1", 128, [PYTHON_TOOL], 'none'),
190-
("meetkai-functionary-medium-v3.2", 128, [], None),
191-
("meetkai-functionary-medium-v3.2", 128, [TEST_TOOL], None),
192-
("meetkai-functionary-medium-v3.2", 128, [PYTHON_TOOL], 'none'),
193-
("meta-llama-Meta-Llama-3.1-8B-Instruct", 128, [], None),
194-
("meta-llama-Meta-Llama-3.1-8B-Instruct", 128, [TEST_TOOL], None),
195-
("meta-llama-Meta-Llama-3.1-8B-Instruct", 128, [PYTHON_TOOL], 'none'),
196-
])
197-
def test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
199+
def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
198200
global server
199201
server.jinja = True
200202
server.n_predict = n_predict
@@ -217,6 +219,31 @@ def test_completion_without_tool_call(template_name: str, n_predict: int, tools:
217219
assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
218220

219221

222+
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
223+
("meta-llama-Llama-3.3-70B-Instruct", 128, [], None),
224+
("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None),
225+
("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
226+
])
227+
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
228+
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
229+
230+
231+
@pytest.mark.slow
232+
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
233+
("meetkai-functionary-medium-v3.1", 128, [], None),
234+
("meetkai-functionary-medium-v3.1", 128, [TEST_TOOL], None),
235+
("meetkai-functionary-medium-v3.1", 128, [PYTHON_TOOL], 'none'),
236+
("meetkai-functionary-medium-v3.2", 128, [], None),
237+
("meetkai-functionary-medium-v3.2", 128, [TEST_TOOL], None),
238+
("meetkai-functionary-medium-v3.2", 128, [PYTHON_TOOL], 'none'),
239+
("meta-llama-Llama-3.2-3B-Instruct", 128, [], None),
240+
("meta-llama-Llama-3.2-3B-Instruct", 128, [TEST_TOOL], None),
241+
("meta-llama-Llama-3.2-3B-Instruct", 128, [PYTHON_TOOL], 'none'),
242+
])
243+
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
244+
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
245+
246+
220247
@pytest.mark.slow
221248
@pytest.mark.parametrize("hf_repo,hf_file,template_override", [
222249
("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", None),
@@ -243,7 +270,7 @@ def test_weather_tool_call(hf_repo: str, hf_file: str, template_override: Tuple[
243270
if template_override:
244271
(template_hf_repo, template_variant) = template_override
245272
server.chat_template_file = f"../../../tests/chat/templates/{template_hf_repo.replace('/', '') + ('-' + template_variant if template_variant else '')}.jinja"
246-
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_hf_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
273+
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
247274
server.start(timeout_seconds=15*60)
248275
res = server.make_request("POST", "/chat/completions", data={
249276
"max_tokens": 256,
@@ -292,7 +319,7 @@ def test_hello_world_tool_call(expected_arguments: str | None, hf_repo: str, hf_
292319
if template_override:
293320
(template_hf_repo, template_variant) = template_override
294321
server.chat_template_file = f"../../../tests/chat/templates/{template_hf_repo.replace('/', '') + ('-' + template_variant if template_variant else '')}.jinja"
295-
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_hf_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
322+
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
296323
server.start(timeout_seconds=15*60)
297324
res = server.make_request("POST", "/chat/completions", data={
298325
"max_tokens": 256,

examples/server/utils.hpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,11 @@ static json oaicompat_completion_params_parse(
596596
throw std::runtime_error("tools param requires --jinja flag");
597597
}
598598
}
599+
if (!use_jinja) {
600+
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
601+
throw std::runtime_error("Unsupported param: tool_choice");
602+
}
603+
}
599604

600605
// Handle "stop" field
601606
if (body.contains("stop") && body.at("stop").is_string()) {
@@ -605,7 +610,6 @@ static json oaicompat_completion_params_parse(
605610
}
606611

607612
// Handle "response_format" field
608-
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
609613
if (body.contains("response_format")) {
610614
json response_format = json_value(body, "response_format", json::object());
611615
std::string response_type = json_value(response_format, "type", std::string());
@@ -649,16 +653,6 @@ static json oaicompat_completion_params_parse(
649653
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
650654
}
651655

652-
// Params supported by OAI but unsupported by llama.cpp
653-
if (!use_jinja) {
654-
static const std::vector<std::string> unsupported_params { "tool_choice" };
655-
for (const auto & param : unsupported_params) {
656-
if (body.contains(param)) {
657-
throw std::runtime_error("Unsupported param: " + param);
658-
}
659-
}
660-
}
661-
662656
// Copy remaining properties to llama_params
663657
// This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
664658
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp

scripts/get_hf_chat_template.py renamed to scripts/get_chat_template.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,20 @@
44
If a model has multiple chat templates, you can specify the variant name.
55
66
Syntax:
7-
./scripts/get_hf_chat_template.py model_id [variant]
7+
./scripts/get_chat_template.py model_id [variant]
88
99
Examples:
10-
./scripts/get_hf_chat_template.py NousResearch/Meta-Llama-3-8B-Instruct
11-
./scripts/get_hf_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use
12-
./scripts/get_hf_chat_template.py meta-llama/Llama-3.2-3B-Instruct
10+
./scripts/get_chat_template.py NousResearch/Meta-Llama-3-8B-Instruct
11+
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use
12+
./scripts/get_chat_template.py meta-llama/Llama-3.2-3B-Instruct
1313
'''
1414

1515
import json
1616
import re
1717
import sys
1818

1919

20-
def get_hf_chat_template(model_id, variant=None):
20+
def get_chat_template(model_id, variant=None):
2121
try:
2222
# Use huggingface_hub library if available.
2323
# Allows access to gated models if the user has access and ran `huggingface-cli login`.
@@ -69,9 +69,10 @@ def main(args):
6969
model_id = args[0]
7070
variant = None if len(args) < 2 else args[1]
7171

72-
template = get_hf_chat_template(model_id, variant)
72+
template = get_chat_template(model_id, variant)
7373
sys.stdout.write(template)
7474

7575

7676
if __name__ == '__main__':
7777
main(sys.argv[1:])
78+

src/llama-grammar.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
560560
}
561561
}
562562
} catch (const std::exception & err) {
563-
fprintf(stderr, "\n%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
563+
fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
564564
rules.clear();
565565
return false;
566566
}

src/llama-grammar.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ struct llama_grammar {
118118
// lazy grammars wait for trigger words or tokens before constraining the sampling.
119119
// we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
120120
// (useful e.g. for tool_choice=required)
121-
bool lazy; // Useful when resetting
122-
bool awaiting_trigger; // Initialized to lazy
121+
bool lazy;
122+
bool awaiting_trigger; // Initialized to true for lazy grammars only
123123
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
124124
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
125125
std::vector<std::string> trigger_words;

tests/test-chat-handler.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,6 @@ struct delta_data {
169169
};
170170

171171
static delta_data init_delta(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens, const json & user_message, const json & delta_message, const json & tools) {
172-
fprintf(stderr, "Template source: %s\n", tmpl.source().c_str());
173-
fprintf(stderr, "Delta message: %s\n", delta_message.dump(2).c_str());
174-
175172
common_chat_params params;
176173
params.parallel_tool_calls = true;
177174
params.messages = json::array();
@@ -209,12 +206,14 @@ static delta_data init_delta(const common_chat_template & tmpl, const std::vecto
209206
return {delta, full_data.grammar, full_data.parser};
210207
}
211208

209+
/*
210+
Applies the template to 1 user message w/ add_generation_prompt=true, then w/ the test message w/ add_generation_prompt=false,
211+
gets the diff, removes any end tokens and parses the result w/ the grammar, checking that
212+
the parsed message is the same as the test_message
213+
*/
212214
static void test_template(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens, const json & test_message, const json & tools = {}, const std::string & expected_delta = "", bool skip_grammar_test = false, bool skip_parser_test = false) {
213-
// auto tool_call_style = common_tool_call_style_detect(tmpl);
214215
common_chat_msg expected_msg = msg_from_json(test_message);
215216

216-
// Format the message: apply the template to 1 user message w/ add_generation_prompt=true, then w/ the extra message w/ add_generation_prompt=false,
217-
// get the diff and try and parse it w/ the grammar.
218217
auto user_message = json {
219218
{"role", "user"},
220219
{"content", "Hello, world!"}
@@ -228,7 +227,6 @@ static void test_template(const common_chat_template & tmpl, const std::vector<s
228227
params.tools = tools;
229228

230229
auto data = init_delta(tmpl, end_tokens, user_message, test_message, tools);
231-
std::cout << "Full delta:\n```\n" << data.delta << "\n```" << std::endl;
232230
if (!expected_delta.empty()) {
233231
assert_equals(expected_delta, data.delta);
234232
}
@@ -449,7 +447,6 @@ static void test_template_output_parsers() {
449447
}
450448

451449
int main() {
452-
// test_parsing();
453450
test_template_output_parsers();
454451

455452
std::cout << "\n[tool-call] All tests passed!" << std::endl;

0 commit comments

Comments
 (0)