Commit 79b0389

Merge branch 'ggml-org:master' into mradermacher
2 parents 633e743 + 2fbe3b7 commit 79b0389

56 files changed: +3570 −541 lines

.github/workflows/release.yml

Lines changed: 2 additions & 0 deletions
@@ -546,6 +546,8 @@ jobs:
 cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
 cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
 cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin

 cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
 cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ The project differentiates between 3 levels of contributors:
 - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
 - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR

common/chat-parser-xml-toolcall.cpp

Lines changed: 36 additions & 18 deletions
@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
 if (reasoning_unclosed) {
 if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
 unclosed_reasoning_content += content;
-if (form.allow_toolcall_in_think) {
-builder.move_to(tc->groups[0].begin);
-if (!builder.try_consume_xml_tool_calls(form)) {
-unclosed_reasoning_content += tool_call_start;
-builder.move_to(tc->groups[0].end);
-}
-} else {
+if (!(form.allow_toolcall_in_think && tc)) {
 unclosed_reasoning_content += tool_call_start;
+continue;
 }
-continue;
 } else {
 reasoning_unclosed = false;
 std::string reasoning_content;
@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
 }
 } else {
 // This <tool_call> start is in thinking block, skip this tool call
-auto pos = think_start + start_think.size();
-unclosed_reasoning_content = content.substr(pos) + tool_call_start;
+// This <tool_call> start is in thinking block
+if (form.allow_toolcall_in_think) {
+unclosed_reasoning_content = content.substr(think_start + start_think.size());
+} else {
+unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+}
 reasoning_unclosed = true;
 content.resize(think_start);
 toolcall_in_think = true;
@@ -805,22 +803,43 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
 }

 // remove potential partial suffix
-if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
-rstrip(content);
-trim_potential_partial_word(content);
-rstrip(content);
+if (builder.pos() == builder.input().size()) {
+if (unclosed_reasoning_content.empty()) {
+rstrip(content);
+trim_potential_partial_word(content);
+rstrip(content);
+} else {
+rstrip(unclosed_reasoning_content);
+trim_potential_partial_word(unclosed_reasoning_content);
+rstrip(unclosed_reasoning_content);
+}
+}
+
+// consume unclosed_reasoning_content if allow_toolcall_in_think is set
+if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+builder.add_reasoning_content(unclosed_reasoning_content);
+} else {
+if (content.empty()) {
+content = start_think + unclosed_reasoning_content;
+} else {
+content += "\n\n" + start_think;
+content += unclosed_reasoning_content;
+}
+}
+unclosed_reasoning_content.clear();
 }

 // Add content
-if (content.size() != 0) {
+if (!content.empty()) {
 // If there are multiple content blocks
 if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
 builder.add_content("\n\n");
 }
 builder.add_content(content);
 }

-// This <tool_call> start is in thinking block, skip this tool call
+// This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
 if (toolcall_in_think && !form.allow_toolcall_in_think) {
 continue;
 }
@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
 if (!tc) {
 GGML_ASSERT(builder.pos() == builder.input().size());
 GGML_ASSERT(unclosed_reasoning_content.empty());
-GGML_ASSERT(!reasoning_unclosed);
+if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
 break;
 }

@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

 /**
 * Parse content uses reasoning and XML-Style tool call
-* TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
 parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);

common/chat-parser-xml-toolcall.h

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ struct xml_tool_call_format {
 std::optional<std::string> last_val_end = std::nullopt;
 std::optional<std::string> last_tool_end = std::nullopt;
 bool trim_raw_argval = false;
-bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+bool allow_toolcall_in_think = false;
 };

 // make a GBNF that accept any strings except those containing any of the forbidden strings.

common/chat-parser.cpp

Lines changed: 3 additions & 2 deletions
@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
 form.tool_start = "<|tool_call_begin|>";
 form.tool_sep = "<|tool_call_argument_begin|>{";
 form.key_start = "\"";
-form.key_val_sep = "\": ";
-form.val_end = ", ";
+form.key_val_sep = "\":";
+form.val_end = ",";
 form.tool_end = "}<|tool_call_end|>";
 form.scope_end = "<|tool_calls_section_end|>";
 form.raw_argval = false;
 form.last_val_end = "";
+form.allow_toolcall_in_think = true;
 return form;
 })();
 builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
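For context: this hunk drops the spaces from the Kimi K2 key/value separators and enables allow_toolcall_in_think, so a tool call that appears inside an unfinished <think> block is parsed instead of being folded back into the reasoning text. A rough, non-authoritative sketch of the kind of output this form is now meant to accept, using the delimiters shown above (the tool name and arguments are made up, and form fields not shown in this hunk may add further markers):

<think>The user asked about the weather, so I should call a tool.
<|tool_call_begin|>get_weather<|tool_call_argument_begin|>{"location":"Paris"}<|tool_call_end|><|tool_calls_section_end|>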

common/chat.cpp

Lines changed: 132 additions & 0 deletions
@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"
@@ -150,6 +151,7 @@ struct templates_params {
 common_chat_tool_choice tool_choice;
 json json_schema;
 bool parallel_tool_calls;
+common_reasoning_format reasoning_format;
 bool stream;
 std::string grammar;
 bool add_generation_prompt = true;
@@ -589,6 +591,16 @@ common_chat_templates_ptr common_chat_templates_init(
 "{%- if false %}");
 }

+// TODO @aldehir : this is a temporary fix, pending Minja changes
+// Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+// search for the error message and patch it
+&& default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+string_replace_all(default_template_src,
+"{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+"{%- if false %}");
+}
+
 std::string token_bos = bos_token_override;
 std::string token_eos = eos_token_override;
 bool add_bos = false;
@@ -987,6 +999,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
 return data;
 }

+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+common_chat_params data;
+
+// Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+auto adjusted_messages = json::array();
+for (const auto & msg : inputs.messages) {
+auto role = msg.value("role", "");
+if (role != "system" && role != "assistant") {
+// Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+adjusted_messages.push_back(msg);
+continue;
+}
+
+auto content = json::array();
+
+// If message contains `reasoning_content`, add it as a block of type `thinking`
+if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+content.push_back({
+{"type", "thinking"},
+{"thinking", msg.at("reasoning_content").get<std::string>()},
+});
+}
+
+// If message contains `content`, add it as a block of type `text`
+if (msg.contains("content")) {
+if (msg.at("content").is_string()) {
+content.push_back({
+{"type", "text"},
+{"text", msg.at("content").get<std::string>()},
+});
+} else if (msg.at("content").is_array()) {
+auto blocks = msg.at("content");
+content.insert(content.end(), blocks.begin(), blocks.end());
+}
+}
+
+auto adjusted = msg;
+adjusted["content"] = content;
+adjusted.erase("reasoning_content");
+adjusted_messages.push_back(adjusted);
+}
+
+auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+auto include_grammar = true;
+
+data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+data.preserved_tokens = {
+"[THINK]",
+"[/THINK]",
+"[TOOL_CALLS]",
+"[ARGS]",
+};
+
+auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+// Response format parser
+if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+// Ministral wants to emit json surrounded by code fences
+return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+}
+
+// Tool call parser
+if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+auto tool_choice = p.choice();
+foreach_function(inputs.tools, [&](const json & tool) {
+const auto & function = tool.at("function");
+std::string name = function.at("name");
+const auto & schema = function.at("parameters");
+
+tool_choice |= p.rule("tool-" + name,
+p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
++ p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+);
+});
+
+auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+}
+
+// Content only parser
+include_grammar = false;
+return reasoning << p.content(p.rest());
+});
+
+data.parser = parser.save();
+
+if (include_grammar) {
+data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+foreach_function(inputs.tools, [&](const json & tool) {
+const auto & function = tool.at("function");
+auto schema = function.at("parameters");
+builder.resolve_refs(schema);
+});
+parser.build_grammar(builder, data.grammar_lazy);
+});
+
+data.grammar_triggers = {
+{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+};
+}
+
+return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
 data.prompt = apply(tmpl, inputs);
@@ -2341,6 +2465,7 @@ static common_chat_params common_chat_templates_apply_jinja(
 params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
 params.add_generation_prompt = inputs.add_generation_prompt;
 params.tool_choice = inputs.tool_choice;
+params.reasoning_format = inputs.reasoning_format;
 params.enable_thinking = inputs.enable_thinking;
 params.grammar = inputs.grammar;
 params.now = inputs.now;
@@ -2504,6 +2629,13 @@
 return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
 }

+// Ministral/Mistral Large 3
+if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+src.find("[TOOL_CALLS]") != std::string::npos &&
+src.find("[ARGS]") != std::string::npos) {
+return common_chat_params_init_ministral_3(tmpl, params);
+}
+
 if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
 return common_chat_params_init_magistral(tmpl, params);
 }
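Taken together, the new Ministral/Mistral Large 3 path builds a PEG parser that expects optional [THINK]...[/THINK] reasoning, then free-form content up to the first [TOOL_CALLS] marker, then one [TOOL_CALLS]<name>[ARGS]<json arguments> block per call (repeated when parallel tool calls are enabled). A hedged sketch of a completion this parser should accept, with a made-up tool name and arguments:

[THINK]The user wants the current weather, so I should call a tool.[/THINK]Checking the weather now.[TOOL_CALLS]get_weather[ARGS]{"location": "Paris"}

When a JSON response format is requested instead of tools, the content is expected inside a ```json ... ``` fence, per the response-format branch of common_chat_params_init_ministral_3 above.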
