@@ -33,21 +33,35 @@ static std::string extract_content_from_mixed_input(const std::string& content,
3333 return qwen3::extract_content_during_parsing (content, is_partial);
3434 } else if (is_deepseek_r1_model (model_name)) {
3535 // DeepSeek R1 content extraction - remove <think> tags and tool calls
36- std::string result = content;
37-
36+ constexpr std::string_view k_think_start{" <think>" };
37+ constexpr std::string_view k_think_end{" </think>" };
38+
39+ auto result = content;
40+
3841 // Remove <think>...</think> tags
3942 size_t think_start = 0 ;
40- while ((think_start = result.find (" <think>" , think_start)) != std::string::npos) {
41- size_t think_end = result.find (" </think>" , think_start);
43+ size_t tool_start = 0 ;
44+ bool is_thinking = false ;
45+ while ((think_start = result.find (k_think_start, think_start)) != std::string::npos) {
46+ size_t think_end = result.find (k_think_end, think_start);
4247 if (think_end != std::string::npos) {
43- result.erase (think_start, think_end + 8 - think_start);
48+ think_start = think_end + k_think_end.length ();
49+ tool_start = think_start;
50+ // result.erase(think_start, think_end + k_think_end.length() - think_start);
4451 } else {
52+ is_thinking = true ;
4553 break ;
4654 }
4755 }
48-
56+
57+ // Is this the right thing to do? If we have an open thinking tag, we just return and do not try to
58+ // remove function calls.
59+ if (is_thinking) {
60+ return result;
61+ }
62+
4963 // Remove DeepSeek R1 tool call syntax
50- size_t tool_start = 0 ;
64+ // size_t tool_start = 0;
5165 while ((tool_start = result.find (" <|tool▁calls▁begin|>" , tool_start)) != std::string::npos) {
5266 size_t tool_end = result.find (" <|tool▁calls▁end|>" , tool_start);
5367 if (tool_end != std::string::npos) {
@@ -56,7 +70,7 @@ static std::string extract_content_from_mixed_input(const std::string& content,
5670 break ;
5771 }
5872 }
59-
73+
6074 return result;
6175 } else {
6276 return kimi_k2::extract_content_during_parsing (content, is_partial);
@@ -67,21 +81,21 @@ static std::string extract_content_from_mixed_input(const std::string& content,
6781static ik_chat_msg parse_chat_message_incremental (const std::string& content, bool is_partial = false , const std::string& model_name = " " ) {
6882 ik_chat_msg msg;
6983 msg.role = " assistant" ;
70-
84+
7185 try {
7286 json tool_calls_json;
7387 bool has_function_syntax = false ;
74-
88+
7589 // Route parsing based on model type
7690 if (is_qwen3_model (model_name)) {
7791 // Use Qwen3 XML parser
7892 tool_calls_json = parse_qwen3_tool_calls (content);
79-
93+
8094 // Check for partial content during streaming
8195 if (is_partial && qwen3::is_partial_content_advanced (content)) {
8296 throw std::runtime_error (" partial structured content detected" );
8397 }
84-
98+
8599 // Check for malformed XML tool call syntax
86100 has_function_syntax = content.find (" <tool_call>" ) != std::string::npos;
87101 } else if (is_deepseek_r1_model (model_name)) {
@@ -92,11 +106,11 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
92106 syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
93107 syntax.reasoning_in_content = true ; // Fix for thinking tag termination issue
94108 syntax.enable_tool_calls = true ;
95-
109+
96110 common_chat_msg_parser parser (content, is_partial, syntax);
97111 parser.parse ();
98112 auto result = parser.result ();
99-
113+
100114 // Convert tool calls to JSON format expected by the system
101115 tool_calls_json = json::array ();
102116 for (const auto & tool_call : result.tool_calls ) {
@@ -107,7 +121,7 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
107121 tc[" function" ][" arguments" ] = tool_call.arguments ;
108122 tool_calls_json.push_back (tc);
109123 }
110-
124+
111125 // Check for malformed DeepSeek R1 tool call syntax
112126 has_function_syntax = content.find (" <|tool▁calls▁begin|>" ) != std::string::npos;
113127 } catch (const common_chat_msg_partial_exception&) {
@@ -121,44 +135,44 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
121135 } else {
122136 // Default to Kimi-K2 parser
123137 tool_calls_json = parse_kimi_k2_tool_calls (content);
124-
138+
125139 // Check for partial content during streaming
126140 if (is_partial && kimi_k2::is_partial_content_advanced (content)) {
127141 throw std::runtime_error (" partial structured content detected" );
128142 }
129-
143+
130144 // Check for malformed function call syntax
131145 has_function_syntax = content.find (" functions." ) != std::string::npos;
132146 }
133-
147+
134148 bool parsing_succeeded = !tool_calls_json.empty ();
135-
149+
136150 if (has_function_syntax && !parsing_succeeded) {
137151 throw std::runtime_error (" malformed function call syntax detected" );
138152 }
139-
140- // Process successful parsing results
153+
154+ // Process successful parsing results
141155 if (!tool_calls_json.empty ()) {
142156 for (const auto & tc_json : tool_calls_json) {
143157 try {
144158 ik_chat_tool_call tc;
145159 tc.id = tc_json.value (" id" , " " );
146-
160+
147161 if (!tc_json.contains (" function" ) || !tc_json[" function" ].is_object () || !tc_json[" function" ].contains (" name" )) {
148162 continue ;
149163 }
150-
164+
151165 tc.name = tc_json[" function" ][" name" ];
152166 if (tc.name .empty ()) {
153167 continue ;
154168 }
155-
169+
156170 if (tc_json[" function" ].contains (" arguments" )) {
157171 tc.arguments = tc_json[" function" ][" arguments" ];
158172 } else {
159173 tc.arguments = " {}" ;
160174 }
161-
175+
162176 // Validate arguments (only if not partial)
163177 if (!is_partial && !tc.arguments .empty ()) {
164178 try {
@@ -168,13 +182,13 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
168182 continue ;
169183 }
170184 }
171-
185+
172186 msg.tool_calls .push_back (tc);
173187 } catch (const std::exception&) {
174188 continue ;
175189 }
176190 }
177-
191+
178192 // Use model-specific content extraction
179193 if (is_qwen3_model (model_name)) {
180194 msg.content = qwen3::extract_content_during_parsing (content, is_partial);
@@ -193,27 +207,27 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
193207 msg.content = kimi_k2::extract_content_during_parsing (content, is_partial);
194208 }
195209 }
196-
210+
197211 } catch (const std::exception& e) {
198212 if (!is_partial) {
199213 // Original llama.cpp fallback pattern - use public API
200214 common_chat_syntax syntax;
201215 syntax.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; // Use content-only format
202-
216+
203217 // Use the public API that handles fallback internally
204218 common_chat_msg fallback_result = common_chat_parse (content, is_partial, syntax);
205-
219+
206220 // Convert to ik_chat_msg
207221 msg.tool_calls .clear ();
208222 msg.content = fallback_result.content ;
209223 }
210224 // If is_partial=true, keep empty result (no content chunks during streaming)
211225 }
212-
226+
213227 return msg;
214228}
215229
// Produce a unique identifier for a tool call, of the form "call_<n>"
// where <n> starts at 1 and grows by one per call.
// NOTE(review): the static counter is not synchronized — confirm this is
// only ever invoked from a single thread before relying on uniqueness
// under concurrency.
static std::string generate_tool_call_id() {
    static int next_id = 0;
    ++next_id;
    return "call_" + std::to_string(next_id);
}
0 commit comments