Merged
Changes from 8 commits
3 changes: 2 additions & 1 deletion common/arg.cpp
@@ -3425,7 +3425,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"- deepseek: puts thoughts in `message.reasoning_content`\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"(default: auto)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
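For reference, a minimal sketch of the name → enum mapping this option handler relies on. This is an assumption-laden reconstruction, not the repository's actual `common_reasoning_format_from_name`: the `COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY` enumerator name and the error handling are guesses; only `NONE`, `AUTO`, and `DEEPSEEK` appear elsewhere in this PR.

```cpp
#include <stdexcept>
#include <string>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // assumed name for "deepseek-legacy"
};

// Hypothetical mapping from the CLI string to the enum; the real function
// lives elsewhere in common/ and may differ in spelling and error handling.
static common_reasoning_format common_reasoning_format_from_name(const std::string & name) {
    if (name == "none")            return COMMON_REASONING_FORMAT_NONE;
    if (name == "auto")            return COMMON_REASONING_FORMAT_AUTO;
    if (name == "deepseek")        return COMMON_REASONING_FORMAT_DEEPSEEK;
    if (name == "deepseek-legacy") return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
    throw std::invalid_argument("unknown reasoning format: " + name);
}
```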
138 changes: 125 additions & 13 deletions common/chat-parser.cpp
@@ -3,9 +3,12 @@
#include "log.h"
#include "regex-partial.h"

#include <algorithm>
#include <cctype>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

using json = nlohmann::ordered_json;
@@ -166,6 +169,23 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
}

bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

auto set_reasoning_prefix = [&](size_t prefix_pos) {
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
return;
}
if (prefix_pos + start_think.size() > input_.size()) {
pending_reasoning_prefix.clear();
return;
}
// Capture the exact literal that opened the reasoning section so we can
// surface it back to callers. This ensures formats that force the
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
// instead of dropping it during parsing.
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
};

auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
@@ -178,28 +198,120 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
if (!pending_reasoning_prefix.empty()) {
add_reasoning_content(pending_reasoning_prefix);
pending_reasoning_prefix.clear();
}
add_reasoning_content(stripped_reasoning);
}
};
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
return false;
}

const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
size_t cursor = pos_;
size_t whitespace_end = cursor;
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
++whitespace_end;
}

if (whitespace_end >= input_.size()) {
restore_state();
if (syntax_.thinking_forced_open) {
auto rest = input_.substr(saved_pos);
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
move_to(input_.size());
return true;
}
return false;
}

cursor = whitespace_end;
const size_t remaining = input_.size() - cursor;
const size_t start_prefix = std::min(start_think.size(), remaining);
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

if (has_start_tag && start_prefix < start_think.size()) {
move_to(input_.size());
return true;
}

if (has_start_tag) {
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
cursor = whitespace_end;
} else {
restore_state();
return false;
}
while (true) {
if (cursor >= input_.size()) {
move_to(input_.size());
return true;
}

size_t end_pos = input_.find(end_think, cursor);
if (end_pos == std::string::npos) {
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
if (reasoning_end > cursor) {
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
}
move_to(input_.size());
return true;
}

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
} else {
handle_reasoning("", /* closed */ true);
}

cursor = end_pos + end_think.size();

while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
++cursor;
}

const size_t next_remaining = input_.size() - cursor;
if (next_remaining == 0) {
move_to(cursor);
return true;
}

const size_t next_prefix = std::min(start_think.size(), next_remaining);
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
if (next_prefix < start_think.size()) {
move_to(input_.size());
return true;
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
}

move_to(cursor);
return true;
}
return false;
}

std::string common_chat_msg_parser::consume_rest() {
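To see what the rewritten `try_parse_reasoning` does end to end, here is a hedged usage sketch built on the public entry point the tests below use (`common_chat_parse(input, is_partial, syntax)`); the expected values in the comments are derived by reading the new code above, not from a verified run.

```cpp
common_chat_syntax syntax = {
    /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
    /* .reasoning_in_content = */ false,
    /* .thinking_forced_open = */ false,
    /* .parse_tool_calls = */ false,
};

// Closed tag: thoughts are split out of the content.
auto full = common_chat_parse("<think>Plan</think>Answer", /* is_partial = */ false, syntax);
// full.reasoning_content == "Plan", full.content == "Answer"

// Unclosed tag mid-stream: the tail is surfaced as (still open) reasoning
// instead of falling back to plain content — the key change for streaming
// clients, which previously only got reasoning_content with deepseek-legacy.
auto partial = common_chat_parse("<think>Plan so far", /* is_partial = */ true, syntax);
// partial.reasoning_content == "Plan so far", partial.content == ""
```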
3 changes: 3 additions & 0 deletions common/chat.cpp
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
builder.try_parse_reasoning("<think>", "</think>");

if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
}

static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
builder.add_content(builder.consume_rest());
}

2 changes: 1 addition & 1 deletion common/common.h
@@ -432,7 +432,7 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

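A small sketch of what this default flip means for code that never sets the flag, assuming `common_params` is used with its in-class initializers as declared above:

```cpp
#include "common.h"

int main() {
    common_params params;
    // Before this change the default was COMMON_REASONING_FORMAT_AUTO; now a
    // server started without --reasoning-format extracts thoughts into
    // message.reasoning_content, DeepSeek-style, by default.
    return params.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? 0 : 1;
}
```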
28 changes: 28 additions & 0 deletions tests/test-chat-parser.cpp
@@ -106,6 +106,34 @@ static void test_reasoning() {
assert_equals("<think>Cogito</think>", builder.result().content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
const std::string variant("content_only_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
const std::string input = "<think>Pense</think>Bonjour";
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
assert_equals(variant, std::string("Bonjour"), msg.content);
}
{
const std::string variant("llama_3_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
const std::string input = "<think>Plan</think>Réponse";
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
assert_equals(variant, std::string("Réponse"), msg.content);
}
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
common_chat_syntax syntax = {
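One case the new tests do not exercise directly but the parser above handles is `thinking_forced_open`, where the chat template has already emitted the opening tag so the reply starts straight in reasoning. A hedged sketch of such a variant in the same style, with expected values derived from reading the new code rather than from a run:

```cpp
{
    const std::string variant("forced_open_inline_think");
    common_chat_syntax syntax = {
        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
        /* .reasoning_in_content = */ false,
        /* .thinking_forced_open = */ true,
        /* .parse_tool_calls = */ false,
    };
    // No literal "<think>" in the input: the forced-open branch treats the
    // prefix up to "</think>" as reasoning.
    const std::string input = "Plan</think>Réponse";
    auto msg = common_chat_parse(input, false, syntax);
    assert_equals(variant, std::string("Plan"), msg.reasoning_content);
    assert_equals(variant, std::string("Réponse"), msg.content);
}
```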
2 changes: 1 addition & 1 deletion tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
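The `deepseek-legacy` row maps onto the parser's `reasoning_in_content` path. A hedged sketch of the difference, reusing the fixture layout from the tests; which enum value legacy mode uses, and whether it also fills `reasoning_content`, is taken from the flag description above rather than verified against the branch in chat-parser.cpp:

```cpp
common_chat_syntax legacy = {
    /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, // assumption for legacy mode
    /* .reasoning_in_content = */ true,
    /* .thinking_forced_open = */ false,
    /* .parse_tool_calls = */ false,
};
auto msg = common_chat_parse("<think>Plan</think>Answer", /* is_partial = */ false, legacy);
// msg.content keeps the tags: "<think>Plan</think>Answer"
```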
@@ -1,7 +1,6 @@
<script lang="ts">
import { getDeletionInfo } from '$lib/stores/chat.svelte';
import { copyToClipboard } from '$lib/utils/copy';
import { parseThinkingContent } from '$lib/utils/thinking';
import ChatMessageAssistant from './ChatMessageAssistant.svelte';
import ChatMessageUser from './ChatMessageUser.svelte';

@@ -47,26 +46,13 @@

let thinkingContent = $derived.by(() => {
if (message.role === 'assistant') {
if (message.thinking) {
return message.thinking;
}

const parsed = parseThinkingContent(message.content);
const trimmedThinking = message.thinking?.trim();

return parsed.thinking;
return trimmedThinking ? trimmedThinking : null;
}
return null;
});

let messageContent = $derived.by(() => {
if (message.role === 'assistant') {
const parsed = parseThinkingContent(message.content);
return parsed.cleanContent?.replace('<|channel|>analysis', '');
}

return message.content?.replace('<|channel|>analysis', '');
});

function handleCancelEdit() {
isEditing = false;
editedContent = message.content;
@@ -165,7 +151,7 @@
{editedContent}
{isEditing}
{message}
{messageContent}
messageContent={message.content}
onCancelEdit={handleCancelEdit}
onConfirmDelete={handleConfirmDelete}
onCopy={handleCopy}
@@ -131,7 +131,11 @@
</div>
</div>
{:else if message.role === 'assistant'}
<MarkdownContent content={messageContent || ''} />
{#if config().disableReasoningFormat}
<pre class="raw-output">{messageContent || ''}</pre>
{:else}
<MarkdownContent content={messageContent || ''} />
{/if}
{:else}
<div class="text-sm whitespace-pre-wrap">
{messageContent}
@@ -203,4 +207,22 @@
background-position: -200% 0;
}
}

.raw-output {
width: 100%;
max-width: 48rem;
margin-top: 1.5rem;
padding: 1rem 1.25rem;
border-radius: 1rem;
background: hsl(var(--muted) / 0.3);
color: var(--foreground);
font-family:
ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
'Liberation Mono', Menlo, monospace;
font-size: 0.875rem;
line-height: 1.6;
white-space: pre-wrap;
word-break: break-word;
}

</style>
@@ -148,6 +148,11 @@
key: 'showThoughtInProgress',
label: 'Show thought in progress',
type: 'checkbox'
},
{
key: 'disableReasoningFormat',
label: 'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
type: 'checkbox'
}
]
},
3 changes: 3 additions & 0 deletions tools/server/webui/src/lib/constants/settings-config.ts
@@ -6,6 +6,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
theme: 'system',
showTokensPerSecond: false,
showThoughtInProgress: false,
disableReasoningFormat: false,
keepStatsVisible: false,
askForTitleConfirmation: false,
pasteLongTextToFileLen: 2500,
@@ -76,6 +77,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
showTokensPerSecond: 'Display generation speed in tokens per second during streaming.',
showThoughtInProgress: 'Expand thought process by default when generating messages.',
disableReasoningFormat:
'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
askForTitleConfirmation:
'Ask for confirmation before automatically changing conversation title when editing the first message.',