Commit 74dcf89

Merge branch 'ggml-org:master' into apertus-implementation

2 parents ffdfd1d + a972fae, commit 74dcf89

File tree: 118 files changed, +5387 −3114 lines


.github/workflows/close-issue.yml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
+          exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"

CONTRIBUTING.md

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,9 @@
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS)
+- Let authors who are also collaborators merge their own PRs
+- When merging a PR by a contributor, make sure you have a good understanding of the changes
+- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contributing long-term, someone else needs to take responsibility (you)

 # Coding guidelines

common/arg.cpp

Lines changed: 42 additions & 22 deletions
@@ -1263,6 +1263,18 @@ static std::string list_builtin_chat_templates() {
     return msg.str();
 }

+static bool is_truthy(const std::string & value) {
+    return value == "on" || value == "enabled" || value == "1";
+}
+
+static bool is_falsey(const std::string & value) {
+    return value == "off" || value == "disabled" || value == "0";
+}
+
+static bool is_autoy(const std::string & value) {
+    return value == "auto" || value == "-1";
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // load dynamic backends
     ggml_backend_load_all();
@@ -1544,21 +1556,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_chunks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(common_arg(
-        {"-fa", "--flash-attn"}, "FA",
-        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
-        [](common_params & params, const std::string & value) {
-            if (value == "on" || value == "enabled" || value == "1") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
-            } else if (value == "off" || value == "disabled" || value == "0") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
-            } else if (value == "auto" || value == "-1") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
-            } else {
-                throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
-            }
-        }
-    ).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
+                       string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
+                                     llama_flash_attn_type_name(params.flash_attn_type)),
+                       [](common_params & params, const std::string & value) {
+                           if (is_truthy(value)) {
+                               params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+                           } else if (is_falsey(value)) {
+                               params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+                           } else if (is_autoy(value)) {
+                               params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+                           } else {
+                               throw std::runtime_error(
+                                   string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+                           }
+                       }).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         "prompt to start generation with; for system message, use -sys",
@@ -3134,13 +3146,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(common_arg(
-        {"--log-colors"},
-        "Enable colored logging",
-        [](common_params &) {
-            common_log_set_colors(common_log_main(), true);
-        }
-    ).set_env("LLAMA_LOG_COLORS"));
+    add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
+                       "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+                       "'auto' enables colors when output is to a terminal",
+                       [](common_params &, const std::string & value) {
+                           if (is_truthy(value)) {
+                               common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+                           } else if (is_falsey(value)) {
+                               common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+                           } else if (is_autoy(value)) {
+                               common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+                           } else {
+                               throw std::invalid_argument(
+                                   string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
+                           }
+                       }).set_env("LLAMA_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
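
For reference, a minimal standalone sketch of the tri-state parsing these helpers enable. The helper bodies are copied from the diff above; the main() driver and its assertions are illustrative only:

    #include <cassert>
    #include <string>

    static bool is_truthy(const std::string & value) {
        return value == "on" || value == "enabled" || value == "1";
    }

    static bool is_falsey(const std::string & value) {
        return value == "off" || value == "disabled" || value == "0";
    }

    static bool is_autoy(const std::string & value) {
        return value == "auto" || value == "-1";
    }

    int main() {
        // Accepted spellings for each state of -fa/--flash-attn and --log-colors:
        assert(is_truthy("on")  && is_truthy("enabled")  && is_truthy("1"));
        assert(is_falsey("off") && is_falsey("disabled") && is_falsey("0"));
        assert(is_autoy("auto") && is_autoy("-1"));
        assert(!is_truthy("yes")); // any other spelling throws in the real parsers
        return 0;
    }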

common/chat.cpp

Lines changed: 152 additions & 0 deletions
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }

+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
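
The new probe renders a dummy conversation twice, once with thinking enabled and once without, and reports support if the prompts differ. A hedged sketch of a possible caller (the function below, its name, and its log lines are invented; only the probe itself comes from this diff, and LOG_INF/LOG_WRN are assumed from common/log.h):

    // Hypothetical caller: surface a "thinking" toggle only when the loaded
    // template actually reacts to enable_thinking.
    static void report_thinking_support(const common_chat_templates * tmpls) {
        if (common_chat_templates_support_enable_thinking(tmpls)) {
            LOG_INF("chat template supports enable_thinking\n");
        } else {
            LOG_WRN("chat template ignores enable_thinking; the flag is a no-op\n");
        }
    }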
@@ -618,6 +631,7 @@ const char * common_chat_format_name(common_chat_format format) {
     case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
     case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
     case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+    case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
     case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
     case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
     case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -685,11 +699,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -724,6 +740,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
@@ -1375,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
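
To make the V3.1 grammar above concrete, here is a hypothetical generation it is meant to accept (tool name and arguments invented; the leading "</think>" applies only when thinking was forced open):

    // Hypothetical DeepSeek V3.1 output matching the "root" rule above:
    static const char * example_v3_1_tool_call =
        "</think>"
        "<|tool▁calls▁begin|>"
        "<|tool▁call▁begin|>get_weather<|tool▁sep|>{\"location\": \"Paris\"}<|tool▁call▁end|>"
        "<|tool▁calls▁end|>";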
@@ -1396,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }

+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
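
For orientation, a self-contained restatement of the branch selection in common_chat_parse_deepseek_v3_1, reduced to plain strings. All names here are invented and the logic is deliberately simplified (it ignores reasoning_format and partial reasoning); the function above is the authoritative version:

    #include <string>

    enum class v31_branch { content, reasoning_then_content, all_reasoning };

    // Simplified mirror of the dispatch in common_chat_parse_deepseek_v3_1.
    static v31_branch classify(const std::string & out, bool thinking_forced_open, bool is_partial) {
        const bool has_end_think = out.find("</think>") != std::string::npos;
        if (thinking_forced_open && !is_partial && !has_end_think) {
            return v31_branch::content;                // complete reply, model never closed the tag
        }
        if (has_end_think) {
            return v31_branch::reasoning_then_content; // "<think>...</think>rest"
        }
        return thinking_forced_open ? v31_branch::all_reasoning
                                    : v31_branch::content;
    }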
@@ -2352,6 +2495,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }

+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+            params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2524,6 +2673,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
             common_chat_parse_deepseek_r1(builder);
             break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;

common/chat.h

Lines changed: 3 additions & 0 deletions
@@ -107,6 +107,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,

@@ -199,6 +200,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);

common/json-schema-to-grammar.cpp

Lines changed: 21 additions & 1 deletion
@@ -843,9 +843,10 @@ class SchemaConverter {
             _build_object_rule(
                 properties, required, name,
                 schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@ class SchemaConverter {
                         required.insert(prop.key());
                     }
                 }
+            } else if (comp_schema.contains("enum")) {
+                for (const auto & v : comp_schema["enum"]) {
+                    const auto rule = _generate_constant_rule(v);
+                    if (enum_values.find(rule) == enum_values.end()) {
+                        enum_values[rule] = 0;
+                    }
+                    enum_values[rule] += 1;
+                }
             } else {
                 // todo warning
             }
@@ -870,6 +879,17 @@ class SchemaConverter {
                 add_component(t, true);
             }
         }
+        if (!enum_values.empty()) {
+            std::vector<std::string> enum_intersection;
+            for (const auto & p : enum_values) {
+                if (p.second == schema["allOf"].size()) {
+                    enum_intersection.push_back(p.first);
+                }
+            }
+            if (!enum_intersection.empty()) {
+                return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+            }
+        }
         return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
     } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
         json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
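
A hedged example of the schema shape this new branch handles: each enum branch of the allOf increments enum_values for its constant rules, and only values counted in every branch (the intersection) survive into the grammar. The schema below is invented, and the exact json_schema_to_grammar signature is assumed from common/json-schema-to-grammar.h:

    #include <cstdio>
    #include <nlohmann/json.hpp>
    #include "json-schema-to-grammar.h"

    int main() {
        nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"({
            "type": "string",
            "allOf": [
                { "enum": ["red", "green", "blue"] },
                { "enum": ["green", "blue", "violet"] }
            ]
        })");
        // Both branches contain "green" and "blue", so the expected (assumed)
        // output is roughly: root ::= ("\"green\"" | "\"blue\"") space
        printf("%s\n", json_schema_to_grammar(schema).c_str());
        return 0;
    }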
