Skip to content

Commit 63f8fe0

Browse files
authored
model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)
* fix gguf conversion for audio/vision mmproj * fix test
1 parent 2233737 commit 63f8fe0

27 files changed

+1462
-41
lines changed

common/chat-auto-parser-generator.cpp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
169169
return build_tool_parser_tag_json(ctx);
170170
case tool_format::TAG_WITH_TAGGED:
171171
return build_tool_parser_tag_tagged(ctx);
172+
case tool_format::TAG_WITH_GEMMA4_DICT:
173+
return build_tool_parser_tag_gemma4_dict(ctx);
172174
default:
173175
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
174176
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
@@ -433,4 +435,113 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
433435
p.end();
434436
}
435437

438+
// Builds the tool-call parser for the Gemma4 "dict" format:
//
//   <|tool_call>call:name{key:<|"|>text<|"|>,other:123}<tool_call|>
//
// One rule ("tool-<name>") is generated per declared function. The returned
// parser is: reasoning parser, then (unless tool calls are forced) optional
// content up to the first call start marker, then the tool call(s), then end
// of input.
//
// A function without a "parameters" entry, or whose "parameters" has no
// object-valued "properties", is matched as `call:name{}` with empty arguments.
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    const bool   force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // The Gemma4 string quote token used in place of JSON "
    static const std::string QUOTE = "<|\"|>";

    // Stand-in for tools that declare no "parameters" at all, so that such a
    // tool is handled like a no-argument function instead of throwing.
    static const json NO_PARAMS = json::object();

    common_peg_parser tool_choice = p.choice();

    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
        const json & params = func.contains("parameters") ? func.at("parameters") : NO_PARAMS;

        // Stays empty when the function takes no arguments
        common_peg_parser args_seq = p.eps();

        if (params.contains("properties") && params.at("properties").is_object()) {
            const auto & properties = params.at("properties");

            // One parser per declared argument
            struct arg_entry {
                std::string       param_name;
                common_peg_parser parser;
            };
            std::vector<arg_entry> arg_entries;

            for (const auto & [param_name, param_schema] : properties.items()) {
                // Anything without an explicit string "type" is treated as a non-string value
                std::string type = "object";
                if (param_schema.contains("type") && param_schema.at("type").is_string()) {
                    param_schema.at("type").get_to(type);
                }

                common_peg_parser value_parser = p.eps();
                if (type == "string") {
                    // String values are delimited by <|"|>...<|"|>
                    value_parser =
                        p.literal(QUOTE) +
                        p.tool_arg_string_value(p.schema(p.until(QUOTE),
                            "tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
                        p.literal(QUOTE);
                } else {
                    // Numbers, booleans: raw text up to the next comma or closing brace.
                    // NOTE(review): array/object values that themselves contain ',' or '}'
                    // would presumably be cut short by this - confirm whether they can occur.
                    value_parser = p.tool_arg_value(p.until_one_of({",", "}"}));
                }

                auto arg = p.tool_arg(
                    p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
                    value_parser +
                    p.tool_arg_close(p.eps()));

                arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
            }

            // Sort alphabetically to match Jinja's dictsort; this fixes the order
            // in which the alternatives are tried below.
            std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
                return a.param_name < b.param_name;
            });

            // Arg sequence: any arg, then up to (count - 1) further comma-separated args.
            // The whole sequence is optional, so `{}` is accepted as well.
            if (!arg_entries.empty()) {
                common_peg_parser any_arg = p.choice();
                for (auto & entry : arg_entries) {
                    any_arg |= entry.parser;
                }
                args_seq = p.optional(
                    any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1));
            }
        }

        // Full parser: call:name{args}
        auto func_parser = p.atomic(
            p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
            p.tool_args(args_seq) +
            p.tool_close(p.literal("}")));

        tool_choice |= p.rule("tool-" + name, func_parser);
    });

    // Wrap each call in <|tool_call>...<tool_call|>
    auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);

    common_peg_parser tool_calls = p.eps();
    if (inputs.parallel_tool_calls) {
        // Several wrapped calls may follow each other, separated by p.space()
        tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
    } else {
        tool_calls = p.trigger_rule("tool-call", wrapped_call);
    }

    if (!force_tools) {
        tool_calls = p.optional(tool_calls);
    }

    // Free-form content is only allowed when a tool call is not mandatory
    auto content_before_tools = p.until(format.per_call_start);
    return ctx.reasoning_parser +
           (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) +
           tool_calls + p.end();
}
546+
436547
} // namespace autoparser

common/chat-auto-parser.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ enum class tool_format {
144144
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
145145
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
146146
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
147+
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
147148
};
148149

149150
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
@@ -156,6 +157,8 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
156157
return os << "TAG_WITH_JSON";
157158
case tool_format::TAG_WITH_TAGGED:
158159
return os << "TAG_WITH_TAGGED";
160+
case tool_format::TAG_WITH_GEMMA4_DICT:
161+
return os << "TAG_WITH_GEMMA4_DICT";
159162
default:
160163
return os << "UNKNOWN";
161164
}
@@ -350,6 +353,7 @@ struct analyze_tools : analyze_base {
350353
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
351354
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
352355
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
356+
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
353357
};
354358

355359
// ============================================================================

common/chat-diff-analyzer.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,33 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
9292
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
9393
}
9494
},
95+
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
96+
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
97+
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
98+
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
99+
analysis.tools.format.per_call_start = "<|tool_call>";
100+
analysis.tools.format.per_call_end = "<tool_call|>";
101+
analysis.tools.format.section_start = "";
102+
analysis.tools.format.section_end = "";
103+
analysis.tools.function.name_prefix = "call:";
104+
analysis.tools.function.name_suffix = "";
105+
analysis.tools.arguments.start = "{";
106+
analysis.tools.arguments.end = "}";
107+
analysis.tools.arguments.name_suffix = ":";
108+
analysis.tools.arguments.separator = ",";
109+
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
110+
analysis.reasoning.start = "<|channel>thought\n";
111+
analysis.reasoning.end = "<channel|>";
112+
analysis.preserved_tokens.clear();
113+
analysis.preserved_tokens.push_back("<|tool_call>");
114+
analysis.preserved_tokens.push_back("<tool_call|>");
115+
analysis.preserved_tokens.push_back("<|tool_response>");
116+
analysis.preserved_tokens.push_back("<tool_response|>");
117+
analysis.preserved_tokens.push_back("<|\"|>");
118+
analysis.preserved_tokens.push_back("<|turn>");
119+
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
120+
}
121+
},
95122
// DeepSeek-R1-Distill-Qwen
96123
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
97124
if (tmpl.src.find(

common/chat.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,6 +1545,50 @@ static void requires_non_null_content(json & messages) {
15451545
}
15461546
}
15471547

1548+
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
1549+
// Convert consecutive role:tool messages into a single user message with tool_responses.
1550+
static void convert_tool_responses_gemma4(json & messages) {
1551+
json result = json::array();
1552+
size_t i = 0;
1553+
while (i < messages.size()) {
1554+
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
1555+
json tool_responses = json::array();
1556+
while (i < messages.size() &&
1557+
messages[i].contains("role") &&
1558+
messages[i].at("role") == "tool") {
1559+
const auto & tool_msg = messages[i];
1560+
std::string name;
1561+
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
1562+
name = tool_msg.at("tool_call_id");
1563+
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
1564+
name = tool_msg.at("name");
1565+
}
1566+
json response;
1567+
if (tool_msg.contains("content")) {
1568+
const auto & content = tool_msg.at("content");
1569+
if (content.is_string()) {
1570+
// Try to parse the content as JSON; fall back to raw string
1571+
try {
1572+
response = json::parse(content.get<std::string>());
1573+
} catch (...) {
1574+
response = content;
1575+
}
1576+
} else {
1577+
response = content;
1578+
}
1579+
}
1580+
tool_responses.push_back({{"name", name}, {"response", response}});
1581+
i++;
1582+
}
1583+
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
1584+
} else {
1585+
result.push_back(messages[i]);
1586+
i++;
1587+
}
1588+
}
1589+
messages = result;
1590+
}
1591+
15481592
static void func_args_not_string(json & messages) {
15491593
GGML_ASSERT(messages.is_array());
15501594
for (auto & message : messages) {
@@ -1673,6 +1717,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
16731717
workaround::func_args_not_string(params.messages);
16741718
}
16751719

1720+
if (src.find("'<|tool_call>call:'") != std::string::npos) {
1721+
workaround::convert_tool_responses_gemma4(params.messages);
1722+
}
1723+
16761724
params.add_generation_prompt = false;
16771725
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
16781726
params.add_generation_prompt = true;

0 commit comments

Comments
 (0)