
Commit 7a61eca

ochafik committed:
fix thinking model's initial trigger (take 2) + test qwq's template

1 parent e2cef66

File tree: 5 files changed, +157 -19 lines

  common/chat.cpp                      (+19 -12)
  models/templates/Qwen-QwQ-32B.jinja  (+62 -0, new file)
  models/templates/README.md           (+1 -0)
  src/llama-grammar.cpp                (+1 -1)
  tests/test-chat.cpp                  (+74 -6)

common/chat.cpp

Lines changed: 19 additions & 12 deletions

@@ -899,13 +899,15 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             schema["maxItems"] = 1;
         }
         builder.add_rule("root",
-            std::string(data.thinking_forced_open ? "\"<|END_THINKING|>\" space " : "") +
+            std::string(data.thinking_forced_open ? "( \"<|END_THINKING|>\" space )? " : "") +
             "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
     });
     data.grammar_triggers.push_back({
         COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-        std::string(data.thinking_forced_open ? "[\\s\\S]*?(<\\|END_THINKING\\|>)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?(<\\|END_THINKING\\|>))?") +
-        "\\s*(<\\|START_ACTION\\|>)[\\s\\S]*"
+        // If thinking_forced_open, then we capture the </think> tag in the grammar,
+        // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+        std::string(data.thinking_forced_open ? "[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?<\\|END_THINKING\\|>\\s*)?") +
+        "(<\\|START_ACTION\\|>)[\\s\\S]*"
     });
     data.preserved_tokens = {
         "<|START_ACTION|>",

@@ -1140,15 +1142,17 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
         // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
         // so we accept common variants (then it's all constrained)
         builder.add_rule("root",
-            std::string(data.thinking_forced_open ? "\"</think>\" space " : "") +
+            std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
             "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
             "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
            "\"<|tool▁calls▁end|>\""
            " space");
         data.grammar_triggers.push_back({
             COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-            std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>)" : "(?:<think>[\\s\\S]*?(</think>))?") +
-            "\\s*(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+            "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
         });
         data.preserved_tokens = {
             "<think>",

@@ -1430,13 +1434,15 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
                 "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
         auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
         builder.add_rule("root",
-            std::string(data.thinking_forced_open ? "\"</think>\" space " : "") +
+            std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
             (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
         // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
         data.grammar_triggers.push_back({
             COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-            std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>)" : "(?:<think>[\\s\\S]*?(</think>))?") + (
-                "\\s*(<tool_call>"
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
+                "(<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\""
             ")[\\s\\S]*"

@@ -1490,12 +1496,13 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {

     auto start = builder.pos();
     if (auto res = builder.try_find_regex(open_regex)) {
-        if (res->groups[0].begin != start && res->groups[4].empty() && res->groups[5].empty()) {
-            // The only syntax we allow after the very start is <function=...> or <function name=...>
+        if (res->groups[0].begin != start && builder.str(res->groups[2]) != "<tool_call>" && res->groups[4].empty() && res->groups[5].empty()) {
+            // The only syntaxes we allow after the very start are <tool_call>, <function=...> or <function name=...>
+            builder.move_to(start);
             builder.add_content(builder.consume_rest());
             return;
         }
-        GGML_ASSERT(res->prelude.empty()); // matching at_start
+        builder.add_content(res->prelude);

         const auto & block_start = res->groups[1];
         std::string block_end = block_start.empty() ? "" : "```";
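
Note on the trigger change above: with thinking_forced_open, the full-match trigger pattern now captures the closing tag ("</think>" or "<|END_THINKING|>") plus trailing whitespace in its first group, and the root grammar rule makes that closing tag optional, so the forwarded text parses whether or not the model has already closed its reasoning block. The sketch below is a standalone illustration, not llama.cpp internals: it shows how such a PATTERN_FULL trigger picks the point where grammar-constrained sampling starts, namely the position of the first capture group that matched anything (the same rule as the loop changed in src/llama-grammar.cpp further down). It reuses the forced-open Command R7B pattern from the hunk above; the sample output string is made up for the example.

// first_capture_demo.cpp -- illustrative only, not part of the commit
#include <iostream>
#include <regex>
#include <string>

int main() {
    // With thinking forced open, the prompt already opened the reasoning block,
    // so the model's output starts mid-reasoning and closes the block itself.
    const std::string out =
        "Let me check the weather first.<|END_THINKING|>"
        "<|START_ACTION|>[{\"tool_name\": \"get_weather\", \"parameters\": {}}]<|END_ACTION|>";

    // Forced-open Command R7B trigger from the diff: group 1 captures the closing
    // tag (plus trailing whitespace), group 2 captures the action opener.
    const std::regex trigger(
        "[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)(<\\|START_ACTION\\|>)[\\s\\S]*");

    std::smatch m;
    if (std::regex_match(out, m, trigger)) {
        // Find the position of the first capture group that matched a non-empty substring.
        size_t start = std::string::npos;
        for (size_t i = 1; i < m.size(); i++) {
            if (m.length(i) > 0) {
                start = m.position(i);
                break;
            }
        }
        // Everything from here on is what would be handed to the GBNF grammar.
        // In this example it begins at "<|END_THINKING|>", which the updated root
        // rule now treats as an optional prefix.
        std::cout << out.substr(start) << "\n";
    }
    return 0;
}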
models/templates/Qwen-QwQ-32B.jinja

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+{%- if tools %}
+{{- '<|im_start|>system\n' }}
+{%- if messages[0]['role'] == 'system' %}
+{{- messages[0]['content'] }}
+{%- else %}
+{{- '' }}
+{%- endif %}
+{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+{%- for tool in tools %}
+{{- "\n" }}
+{{- tool | tojson }}
+{%- endfor %}
+{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+{%- if messages[0]['role'] == 'system' %}
+{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- for message in messages %}
+{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" and not message.tool_calls %}
+{%- set content = message.content %}
+{%- if not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{%- endif %}
+{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" %}
+{%- set content = message.content %}
+{%- if not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{%- endif %}
+{{- '<|im_start|>' + message.role }}
+{%- if message.content %}
+{{- '\n' + content }}
+{%- endif %}
+{%- for tool_call in message.tool_calls %}
+{%- if tool_call.function is defined %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '\n<tool_call>\n{"name": "' }}
+{{- tool_call.name }}
+{{- '", "arguments": ' }}
+{{- tool_call.arguments | tojson }}
+{{- '}\n</tool_call>' }}
+{%- endfor %}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+{{- '<|im_start|>user' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>' }}
+{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|im_start|>assistant\n<think>\n' }}
+{%- endif %}

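The point to notice in this template: when add_generation_prompt is set, the prompt ends with '<|im_start|>assistant\n<think>\n', i.e. the reasoning block is already open when generation starts. That is why the new QwQ test below expects thinking_forced_open == true, and why the trigger patterns above must tolerate output that begins mid-reasoning. Tracing the template by hand for a minimal tool-less exchange (the system and user texts are made-up placeholders) gives roughly:

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What's the weather in Paris?<|im_end|>
<|im_start|>assistant
<think>
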
models/templates/README.md

Lines changed: 1 addition & 0 deletions

@@ -19,4 +19,5 @@ These templates can be updated with the following commands:
 ./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
 ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
 ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
+./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
 ```

src/llama-grammar.cpp

Lines changed: 1 addition & 1 deletion

@@ -1180,7 +1180,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     // get from the first matched capturing group to the end of the string
     size_t start = std::string::npos;
     for (auto i = 1u; i < match.size(); i++) {
-        if (!match.str(i).empty()) {
+        if (match.length(i) > 0) {
             start = match.position(i);
             break;
         }

tests/test-chat.cpp

Lines changed: 74 additions & 6 deletions

@@ -327,7 +327,17 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
            {
                const auto & pattern = trigger.value;
                if (std::regex_match(constrained, match, std::regex(pattern))) {
-                    pos = match.position(1);
+                    auto mpos = std::string::npos;
+                    for (size_t i = 1; i < match.size(); ++i) {
+                        if (match[i].length() > 0) {
+                            mpos = match.position(i);
+                            break;
+                        }
+                    }
+                    if (mpos == std::string::npos) {
+                        mpos = match.position(0);
+                    }
+                    pos = mpos;
                }
                break;
            }

@@ -469,6 +479,15 @@ const common_chat_msg message_assist_call {
     /* .tool_name = */ "",
     /* .tool_call_id = */ "",
 };
+const common_chat_msg message_assist_call_content {
+    "assistant",
+    "Hello, world!\nWhat's up?",
+    /* .content_parts = */ {},
+    tool_calls,
+    /* .reasoning_content = */ "",
+    /* .tool_name = */ "",
+    /* .tool_call_id = */ "",
+};
 const common_chat_msg message_assist_thoughts_no_content {
     "assistant",
     "",

@@ -722,8 +741,11 @@ static void test_template_output_parsers() {
        auto tmpls = read_templates("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja");
        std::vector<std::string> end_tokens{ "<|END_OF_TURN_TOKEN|>" };

-        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
-        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
+        for (const auto & inputs : { inputs_no_tools, inputs_tools }) {
+            auto params = common_chat_templates_apply(tmpls.get(), inputs);
+            assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, params.format);
+            assert_equals(false, params.thinking_forced_open);
+        }

        assert_msg_equals(message_assist,
                          common_chat_parse(

@@ -877,11 +899,25 @@ static void test_template_output_parsers() {
            tmpls.get(), end_tokens, message_assist_call_id, tools,
            "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]");
    }
+    {
+        auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja");
+        std::vector<std::string> end_tokens{ "<|im_end|>" };
+
+        for (const auto & inputs : { inputs_no_tools, inputs_tools }) {
+            auto params = common_chat_templates_apply(tmpls.get(), inputs);
+            assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, params.format);
+            assert_equals(true, params.thinking_forced_open);
+        }
+    }
    {
        auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja");
        std::vector<std::string> end_tokens{ "<|im_end|>" };

-        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
+        for (const auto & inputs : { inputs_no_tools, inputs_tools }) {
+            auto params = common_chat_templates_apply(tmpls.get(), inputs);
+            assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, params.format);
+            assert_equals(false, params.thinking_forced_open);
+        }
        assert_equals(
            COMMON_CHAT_FORMAT_HERMES_2_PRO,
            common_chat_templates_apply(

@@ -937,6 +973,13 @@ static void test_template_output_parsers() {
                "</tool_call>",
                /* is_partial= */ false,
                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
+        assert_msg_equals(message_assist_call_content,
+            common_chat_parse(
+                "Hello, world!\nWhat's up?<tool_call>\n"
+                "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
+                "</tool_call>",
+                /* is_partial= */ false,
+                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
        assert_msg_equals(
            message_assist_call,
            common_chat_parse(

@@ -1066,6 +1109,27 @@ static void test_template_output_parsers() {
                /* is_partial= */ false,
                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));

+        assert_msg_equals(
+            {
+                /* .role = */ "assistant",
+                "This is not a tool call:\n"
+                "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}",
+                /* .content_parts = */ {},
+                /* .tool_calls = */ {},
+                /* .reasoning_content = */ "",
+                /* .tool_name = */ "",
+                /* .tool_call_id = */ "",
+            },
+            common_chat_parse(
+                "This is not a tool call:\n"
+                "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}",
+                /* is_partial= */ false,
+                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
+        assert_msg_equals(message_assist,
+            common_chat_parse(
+                "Hello, world!\nWhat's up?",
+                /* is_partial= */ false,
+                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
        assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
            common_chat_parse(
                "<think>I'm thinking</think>Hello, world!\nWhat's up?",

@@ -1162,7 +1226,7 @@ static void test_template_output_parsers() {
                is_partial,
                {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1}));
    }
-
+
    test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
    test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
        "<function=special_function>{\"arg1\": 1}</function>");

@@ -1243,7 +1307,11 @@ static void test_template_output_parsers() {
        auto tmpls = read_templates("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja");
        std::vector<std::string> end_tokens{ "<|end▁of▁sentence|>" };

-        assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
+        for (const auto & inputs : { inputs_no_tools, inputs_tools }) {
+            auto params = common_chat_templates_apply(tmpls.get(), inputs);
+            assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, params.format);
+            assert_equals(true, params.thinking_forced_open);
+        }

        test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
        test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
