
Commit eaeed7d
Author: Olivier Chafik

fix trigger of thinking models (must happen after thoughts are closed)

Parent: 6ed8a8f

File tree: 10 files changed (+372, -192 lines)

common/chat-parser.cpp

Lines changed: 8 additions & 5 deletions
@@ -12,8 +12,8 @@

 using json = nlohmann::ordered_json;

-common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, bool extract_reasoning)
-    : input_(input), is_partial_(is_partial), extract_reasoning_(extract_reasoning)
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_reasoning_syntax & reasoning_syntax)
+    : input_(input), is_partial_(is_partial), reasoning_syntax_(reasoning_syntax)
 {
     result_.role = "assistant";

@@ -129,14 +129,17 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }

 void common_chat_msg_parser::try_consume_think_tags(const common_regex & start_think_regex, const common_regex & end_think_regex) {
-    if (extract_reasoning_) {
-        if (try_consume_regex(start_think_regex)) {
+    if (reasoning_syntax_.format != COMMON_REASONING_FORMAT_NONE) {
+        if (reasoning_syntax_.thinking_forced_open || try_consume_regex(start_think_regex)) {
             if (auto res = try_find_regex(end_think_regex)) {
                 result_.reasoning_content = res->prelude;
                 consume_spaces();
             } else {
                 result_.reasoning_content = consume_rest();
-                incomplete("Failed to find end of reasoning tag " + end_think_regex.str());
+                if (!reasoning_syntax_.thinking_forced_open) {
+                    incomplete("Failed to find end of reasoning tag " + end_think_regex.str());
+                }
+                return;
             }
         } else if (auto res = try_find_regex(end_think_regex)) {
             result_.reasoning_content = res->prelude;
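The behavioral change above is easiest to see in isolation. Below is a minimal, standalone C++ sketch (standard library only, not the repo's common_chat_msg_parser API) of the same idea: when the chat template has already force-opened a `<think>` block, everything before a closing `</think>` counts as reasoning, and a missing closing tag is tolerated instead of being flagged as incomplete. The helper name parse_thoughts and its struct are hypothetical.

```cpp
// Standalone sketch of the forced-open reasoning behavior (not the llama.cpp API).
#include <iostream>
#include <string>

struct parsed_msg {
    std::string reasoning;
    std::string content;
};

static parsed_msg parse_thoughts(const std::string & input, bool thinking_forced_open) {
    parsed_msg out;
    std::string rest = input;
    bool open = thinking_forced_open;
    if (!open && rest.rfind("<think>", 0) == 0) { // explicit opening tag
        rest = rest.substr(7);
        open = true;
    }
    if (open) {
        auto end = rest.find("</think>");
        if (end == std::string::npos) {
            // forced-open thoughts with no closing tag yet: everything so far is reasoning
            out.reasoning = rest;
            return out;
        }
        out.reasoning = rest.substr(0, end);
        rest = rest.substr(end + 8);
    }
    out.content = rest;
    return out;
}

int main() {
    auto a = parse_thoughts("Let me think...", /* thinking_forced_open= */ true);
    std::cout << "reasoning: " << a.reasoning << " | content: " << a.content << "\n";
    auto b = parse_thoughts("plan</think>The answer is 42.", true);
    std::cout << "reasoning: " << b.reasoning << " | content: " << b.content << "\n";
}
```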

common/chat-parser.h

Lines changed: 5 additions & 8 deletions
@@ -8,8 +8,6 @@
 #include <string>
 #include <vector>

-using common_string_ranges = std::vector<common_string_range>;
-
 class common_chat_msg_partial_exception : public std::runtime_error {
   public:
     common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
@@ -18,18 +16,17 @@ class common_chat_msg_parser
 class common_chat_msg_parser {
     std::string input_;
     bool is_partial_;
-    bool extract_reasoning_;
+    common_chat_reasoning_syntax reasoning_syntax_;
+
     size_t pos_ = 0;
     common_chat_msg result_;
     std::string healing_marker_;

   public:
-    common_chat_msg_parser(const std::string & input, bool is_partial, bool extract_reasoning);
-
+    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_reasoning_syntax & reasoning_syntax);
     const std::string & input() const { return input_; }
     const std::string & healing_marker() const { return healing_marker_; }
     const bool & is_partial() const { return is_partial_; }
-    const bool & extract_reasoning() const { return extract_reasoning_; }
     const common_chat_msg & result() const { return result_; }

     void move_to(size_t pos) {
@@ -70,13 +67,13 @@ class common_chat_msg_parser

     struct find_regex_result {
         std::string prelude;
-        common_string_ranges groups;
+        std::vector<common_string_range> groups;
     };

     std::optional<find_regex_result> try_find_regex(const common_regex & regex);

     struct consume_regex_result {
-        common_string_ranges groups;
+        std::vector<common_string_range> groups;
     };
     consume_regex_result consume_regex(const common_regex & regex);

common/chat.cpp

Lines changed: 142 additions & 71 deletions
Large diffs are not rendered by default.

common/chat.h

Lines changed: 26 additions & 20 deletions
@@ -37,6 +37,8 @@ struct common_chat_msg {
     std::string tool_name;
     std::string tool_call_id;

+    template <class T> T to_json_oaicompat() const;
+
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
@@ -54,6 +56,21 @@
     }
 };

+struct common_chat_msg_diff {
+    // std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
+};
+
 struct common_chat_tool {
     std::string name;
     std::string description;
@@ -73,14 +90,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -95,19 +109,26 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
 };

 struct common_chat_params {
     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };

+struct common_chat_reasoning_syntax {
+    common_reasoning_format format = COMMON_REASONING_FORMAT_NONE;
+    bool inlined_in_content = false;
+    bool thinking_forced_open = false;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

@@ -145,7 +166,7 @@ std::string common_chat_format_example(
     bool use_jinja);

 std::string common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse(const std::string & input, common_chat_format format, bool is_partial = false);
+common_chat_msg common_chat_parse(const std::string & input, common_chat_format format, bool is_partial = false, const common_chat_reasoning_syntax & reasoning_syntax = {});

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

@@ -158,18 +179,3 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
-
-struct common_chat_msg_diff {
-    // std::string reasoning_content_delta;
-    std::string content_delta;
-    size_t tool_call_index = std::string::npos;
-    common_chat_tool_call tool_call_delta;
-
-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
-
-    bool operator==(const common_chat_msg_diff & other) const {
-        return content_delta == other.content_delta
-            && tool_call_index == other.tool_call_index
-            && tool_call_delta == other.tool_call_delta;
-    }
-};
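A hedged usage sketch of the new common_chat_parse signature declared above. It assumes it is compiled inside the llama.cpp tree at this commit (so common/chat.h is on the include path); the wrapper function name is hypothetical, and the chosen format and reasoning values are illustrative only (COMMON_REASONING_FORMAT_DEEPSEEK is assumed to be the non-NONE enum value).

```cpp
#include "chat.h"

// Illustrative only: parse a partially streamed DeepSeek-R1 style message whose
// chat template has already force-opened the <think> block.
static common_chat_msg parse_streamed_chunk(const std::string & text_so_far) {
    common_chat_reasoning_syntax reasoning_syntax;
    reasoning_syntax.format               = COMMON_REASONING_FORMAT_DEEPSEEK; // assumed enum value
    reasoning_syntax.thinking_forced_open = true;

    return common_chat_parse(text_so_far, COMMON_CHAT_FORMAT_DEEPSEEK_R1,
                             /* is_partial */ true,
                             reasoning_syntax);
}
```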

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ enum common_grammar_trigger_type {
     COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
     COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };

 struct common_grammar_trigger {
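For illustration, a small sketch of what the renamed trigger type expresses: a PATTERN_FULL trigger carries a regex that must describe the entire generated output, rather than a fragment that the sampler anchors itself. The helper name and the pattern string below are assumptions, not code from this commit; the type and value fields match the usage in common/sampling.cpp further down.

```cpp
#include "common.h"

// Illustrative: a whole-output trigger, e.g. one that only fires once a
// forced-open reasoning block has been closed before a tool call starts.
static common_grammar_trigger make_full_pattern_trigger() {
    common_grammar_trigger trigger;
    trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL;
    trigger.value = "[\\s\\S]*?(</think>\\s*)?(<tool_call>)[\\s\\S]*"; // assumed example pattern
    return trigger;
}
```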

common/regex-partial.cpp

Lines changed: 16 additions & 13 deletions
@@ -22,31 +22,34 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
             common_regex_match res;
             res.type = COMMON_REGEX_MATCH_TYPE_FULL;
             for (size_t i = 0; i < match.size(); ++i) {
-                common_string_range group;
-                group.begin = pos + match.position(i);
-                group.end = group.begin + match.length(i);
-                res.groups.push_back(group);
+                auto begin = pos + match.position(i);
+                res.groups.emplace_back(begin, begin + match.length(i));
             }
             return res;
         }
     }
     std::match_results<std::string::const_reverse_iterator> srmatch;
     if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
         auto group = srmatch[1].str();
-        auto it = srmatch[1].second.base();
-        // auto position = static_cast<size_t>(std::distance(input.begin(), it));
-        if ((!as_match && !at_start_) || it == input.begin()) {
-            common_regex_match res;
-            res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
-            //res.groups.push_back({input.substr(position), position, input.size()});
-            res.groups.push_back({pos + std::distance(input.begin(), it), input.size()});
-            return res;
+        if (group.length() != 0) {
+            auto it = srmatch[1].second.base();
+            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
+            if ((!as_match && !at_start_) || it == input.begin()) {
+                common_regex_match res;
+                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
+                auto begin = std::distance(input.begin(), it);
+                GGML_ASSERT(begin >= 0);
+                auto end = input.size();//begin + group.length();
+                GGML_ASSERT(static_cast<size_t>(begin) <= end);
+                res.groups.push_back({static_cast<size_t>(begin), end});
+                return res;
+            }
         }
     }
     return {};
 }

-/*
+/*xz
  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.

  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
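The `group.length() != 0` guard above exists so that an empty trailing group is not reported as a partial match. Below is a standard-library-only sketch of the underlying idea; this is not common_regex itself, and find_partial_suffix is a hypothetical helper.

```cpp
// Standalone illustration: when streaming, a suffix of the input may be an
// incomplete start of a token such as "</think>", but an empty suffix should
// never be reported as a partial match.
#include <algorithm>
#include <iostream>
#include <string>

// Returns the start offset of the longest non-empty suffix of `input` that is a
// proper prefix of `token`, or std::string::npos if there is none.
static size_t find_partial_suffix(const std::string & input, const std::string & token) {
    for (size_t len = std::min(input.size(), token.size() - 1); len > 0; --len) {
        if (input.compare(input.size() - len, len, token, 0, len) == 0) {
            return input.size() - len;
        }
    }
    return std::string::npos; // the empty suffix is not a partial match
}

int main() {
    std::cout << find_partial_suffix("some reasoning</thi", "</think>") << "\n";                     // 14
    std::cout << (find_partial_suffix("some reasoning", "</think>") == std::string::npos) << "\n";   // 1
}
```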

common/regex-partial.h

Lines changed: 6 additions & 0 deletions
@@ -2,6 +2,7 @@

 #include <regex>
 #include <string>
+#include "ggml.h"

 enum common_regex_match_type {
     COMMON_REGEX_MATCH_TYPE_NONE,
@@ -12,6 +13,11 @@ enum common_regex_match_type {
 struct common_string_range {
     size_t begin;
     size_t end;
+    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
+        GGML_ASSERT(begin <= end);
+    }
+    // prevent default ctor
+    common_string_range() = delete;
     bool empty() const {
         return begin == end;
     }
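A minimal standalone mirror of the checked range added above, using assert instead of GGML_ASSERT, just to make the new invariant concrete; checked_range is a hypothetical stand-in, not the real common_string_range.

```cpp
#include <cassert>
#include <cstddef>

struct checked_range {
    size_t begin;
    size_t end;
    // A range must satisfy begin <= end; violating it aborts at construction time.
    checked_range(size_t begin, size_t end) : begin(begin), end(end) {
        assert(begin <= end);
    }
    // No default constructor: an uninitialized range cannot be created by accident.
    checked_range() = delete;
    bool empty() const { return begin == end; }
};

int main() {
    checked_range ok{3, 10};     // fine
    // checked_range bad{10, 3}; // would trip the assert, like GGML_ASSERT in the real struct
    return ok.empty() ? 1 : 0;
}
```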

common/sampling.cpp

Lines changed: 7 additions & 8 deletions
@@ -160,7 +160,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
         } else {
-            std::vector<std::string> patterns_at_start;
+            std::vector<std::string> trigger_patterns;
             std::vector<std::string> patterns_anywhere;
             std::vector<llama_token> trigger_tokens;
             for (const auto & trigger : params.grammar_triggers) {
@@ -172,10 +172,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                         break;
                     }
                     case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
-                    case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
                     {
-                        const auto & pattern = trigger.value;
-                        (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                        patterns_anywhere.push_back(trigger.value);
+                        break;
+                    }
+                    case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                    {
+                        trigger_patterns.push_back(trigger.value);
                         break;
                     }
                     case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -189,10 +192,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 }
             }

-            std::vector<std::string> trigger_patterns;
-            if (!patterns_at_start.empty()) {
-                trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
-            }
             if (!patterns_anywhere.empty()) {
                 trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
             }
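To make the control flow above concrete, here is a standard-library-only sketch of how the two trigger kinds end up in trigger_patterns: PATTERN values are OR-ed together and wrapped so they can fire anywhere in the output, while PATTERN_FULL values are taken as complete whole-output patterns, which is what lets a thinking model's trigger wait until the thoughts are closed (per the commit message). The local string_join and the example pattern strings are stand-ins/assumptions.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Local stand-in for the common string_join helper.
static std::string string_join(const std::vector<std::string> & parts, const std::string & sep) {
    std::string out;
    for (size_t i = 0; i < parts.size(); ++i) {
        if (i > 0) out += sep;
        out += parts[i];
    }
    return out;
}

int main() {
    // PATTERN_FULL triggers are passed through unchanged (illustrative value).
    std::vector<std::string> trigger_patterns = {
        "[\\s\\S]*?</think>\\s*(<tool_call>)[\\s\\S]*",
    };
    // PATTERN triggers get wrapped so they may match anywhere in the output.
    std::vector<std::string> patterns_anywhere = {"<tool_call>", "\\{\"name\":"};

    if (!patterns_anywhere.empty()) {
        trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
    }
    for (const auto & p : trigger_patterns) {
        std::cout << p << "\n";
    }
}
```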

docs/function-calling.md

Lines changed: 50 additions & 24 deletions
@@ -329,32 +329,58 @@ Test in CLI (or with any library / software that can use OpenAI-compatible API b

 ```bash
 curl http://localhost:8080/v1/chat/completions -d '{
-    "model": "gpt-3.5-turbo",
-    "tools": [
-    {
-        "type":"function",
-        "function":{
-            "name":"python",
-            "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
-            "parameters":{
-                "type":"object",
-                "properties":{
-                    "code":{
-                        "type":"string",
-                        "description":"The code to run in the ipython interpreter."
+  "model": "gpt-3.5-turbo",
+  "tools": [
+    {
+      "type":"function",
+      "function":{
+        "name":"python",
+        "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        "parameters":{
+          "type":"object",
+          "properties":{
+            "code":{
+              "type":"string",
+              "description":"The code to run in the ipython interpreter."
+            }
+          },
+          "required":["code"]
         }
-        },
-        "required":["code"]
       }
-    }
-    }
-  ],
-  "messages": [
-    {
-      "role": "user",
-      "content": "Print a hello world message with python."
-    }
-  ]
+    }
+  ],
+  "messages": [
+    {
+      "role": "user",
+      "content": "Print a hello world message with python."
+    }
+  ]
+}'
+
+
+curl http://localhost:8080/v1/chat/completions -d '{
+  "model": "gpt-3.5-turbo",
+  "messages": [
+    {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
+    {"role": "user", "content": "What is the weather in Istanbul?"}
+  ],
+  "tools": [{
+    "type":"function",
+    "function":{
+      "name":"get_current_weather",
+      "description":"Get the current weather in a given location",
+      "parameters":{
+        "type":"object",
+        "properties":{
+          "location":{
+            "type":"string",
+            "description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
+          }
+        },
+        "required":["location"]
+      }
+    }
+  }]
 }'
 ```
