
Commit d70fd06

refactor: implement streaming-aware universal reasoning parser
Remove the streaming-mode limitation from --reasoning-format by refactoring try_parse_reasoning() to handle incremental parsing of <think> tags across all formats.

- Rework try_parse_reasoning() to track whitespace, partial tags, and multiple reasoning segments, allowing proper separation of reasoning_content and content in streaming mode
- Parse reasoning tags before tool-call handling in the content-only and Llama 3.x formats so inline <think> blocks are captured correctly
- Change the default reasoning_format from 'auto' to 'deepseek' for consistent behavior
- Add a 'deepseek-legacy' option to preserve the old inline behavior when needed
- Update the CLI help and documentation to reflect streaming support
- Add parser tests for inline <think>...</think> segments

The parser now continues processing content after </think> closes instead of stopping there, enabling proper message.reasoning_content and message.content separation in both streaming and non-streaming modes. This fixes the issue where streaming responses would dump everything (including post-thinking content) into reasoning_content while leaving content empty.
1 parent 177a6be commit d70fd06
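To make the intended split concrete, here is a minimal sketch of the new behaviour, built on the common_chat_parse() entry point and the common_chat_syntax initializer used in tests/test-chat-parser.cpp below. The header path, helper name, and expected values are illustrative assumptions, not part of the commit:

// Sketch only: assumes common_chat_parse()/common_chat_syntax are visible via common/chat.h.
#include "chat.h"

static void example_inline_think_split() {
    common_chat_syntax syntax = {
        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,   // the new default
        /* .reasoning_in_content = */ false,
        /* .thinking_forced_open = */ false,
        /* .parse_tool_calls = */ false,
    };

    auto msg = common_chat_parse("<think>Plan the answer</think>Here it is", /* is_partial */ false, syntax);
    // Expected with this change: msg.reasoning_content == "Plan the answer"
    //                            msg.content           == "Here it is"
    // Previously, a streamed response could end up entirely in reasoning_content with content left empty.
}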

File tree

6 files changed: +138 -17 lines changed

common/arg.cpp

Lines changed: 3 additions & 2 deletions
@@ -3429,8 +3429,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "(default: auto)",
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }

common/chat-parser.cpp

Lines changed: 102 additions & 13 deletions
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -181,25 +184,111 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
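The partial-tag branch above (string_find_partial_stop() on the remaining view) is what lets streamed chunks be parsed incrementally. A minimal sketch of the expected effect, again assuming the common_chat_parse() entry point from the tests; the expected values are illustrative, not asserted by this commit:

common_chat_syntax syntax = {
    /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
    /* .reasoning_in_content = */ false,
    /* .thinking_forced_open = */ false,
    /* .parse_tool_calls = */ false,
};

// Accumulated text mid-stream: the closing tag has only partially arrived.
auto partial = common_chat_parse("<think>Check the edge cases</thi", /* is_partial */ true, syntax);
// Expected: partial.reasoning_content == "Check the edge cases", partial.content == ""
// The dangling "</thi" is recognized as a partial stop and kept out of both fields.

// A later update with the complete text finalizes the split.
auto full = common_chat_parse("<think>Check the edge cases</think>Looks good", /* is_partial */ false, syntax);
// Expected: full.reasoning_content == "Check the edge cases", full.content == "Looks good"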

common/chat.cpp

Lines changed: 3 additions & 0 deletions
@@ -1336,6 +1336,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2786,6 +2788,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -432,7 +432,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 

tests/test-chat-parser.cpp

Lines changed: 28 additions & 0 deletions
@@ -106,6 +106,34 @@ static void test_reasoning() {
         assert_equals("<think>Cogito</think>", builder.result().content);
         assert_equals("Ergo sum", builder.consume_rest());
     }
+    {
+        const std::string variant("content_only_inline_think");
+        common_chat_syntax syntax = {
+            /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+            /* .reasoning_in_content = */ false,
+            /* .thinking_forced_open = */ false,
+            /* .parse_tool_calls = */ false,
+        };
+        const std::string input = "<think>Pense</think>Bonjour";
+        auto msg = common_chat_parse(input, false, syntax);
+        assert_equals(variant, std::string("Pense"), msg.reasoning_content);
+        assert_equals(variant, std::string("Bonjour"), msg.content);
+    }
+    {
+        const std::string variant("llama_3_inline_think");
+        common_chat_syntax syntax = {
+            /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
+            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+            /* .reasoning_in_content = */ false,
+            /* .thinking_forced_open = */ false,
+            /* .parse_tool_calls = */ false,
+        };
+        const std::string input = "<think>Plan</think>Réponse";
+        auto msg = common_chat_parse(input, false, syntax);
+        assert_equals(variant, std::string("Plan"), msg.reasoning_content);
+        assert_equals(variant, std::string("Réponse"), msg.content);
+    }
     // Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
     {
         common_chat_syntax syntax = {

tools/server/README.md

Lines changed: 1 addition & 1 deletion
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
