1 change: 1 addition & 0 deletions common/arg.cpp
@@ -3442,6 +3442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content`\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"- minimax-m2: streams a synthetic opening `<think>` and keeps `</think>` tags in `message.content`\n"
Collaborator:

Should we name this something more generic, like `synthetic`?

Collaborator (Author):

@ngxson I've moved as much as possible to chat.cpp. For parameter naming, I kept consistency with existing formats, treating the first model (DeepSeek) as the "parent" behavior reference.

However, we could prepare a more modular refactor by renaming the parameters to better reflect their actual behavior:

  • none -> disables the backend parser (name already good)
  • deepseek -> remove, or document that it's an alias of "auto" (the most common case: the backend reasoning parser writes reasoning into `reasoning_content` chunks, the OpenAI-compatible target)
  • deepseek-legacy -> rename to something clearer, e.g. "clone"? (inline `<think>` tags plus a duplicate in `reasoning_content`, i.e. legacy + OAI-compat mirroring; I don't have a use case for this)
  • minimax-m2 (this PR) -> inline reasoning tags + a synthetic `<think>` opening tag when it's missing

To make this truly generic, we'd need an additional parameter that defines the prepended string instead of hardcoding `<think>`. The use case: Jinja templates that pre-open the reasoning tag, so the model never regenerates it and the output becomes difficult to parse afterwards.

Would you prefer I open a follow-up issue to discuss a more generic synthetic-prefix approach with configurable strings?
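For concreteness, a rough sketch of what that configurable synthetic-prefix approach could look like; the struct, field, and helper names below are invented for illustration and are not part of this PR or the current codebase:

```cpp
// Hypothetical sketch only: make the synthetic prefix configurable instead of
// hardcoding "<think>". None of these names exist in the codebase today.
#include <string>

struct synthetic_prefix_options {
    std::string reasoning_prefix; // e.g. "<think>\n", supplied by a new CLI flag or request field
};

// Prepend the configured prefix when the chat template pre-opens the reasoning
// block, so the parser always sees a matching open/close pair.
static std::string apply_synthetic_prefix(const synthetic_prefix_options & opts, std::string text) {
    const bool already_prefixed = text.rfind(opts.reasoning_prefix, 0) == 0;
    if (!opts.reasoning_prefix.empty() && !already_prefixed) {
        text.insert(0, opts.reasoning_prefix);
    }
    return text;
}
```

Under that scheme, `minimax-m2` would amount to `reasoning_prefix = "<think>\n"` plus inline tags, and other templates that pre-open reasoning could set their own string.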

"(default: auto)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
3 changes: 2 additions & 1 deletion common/chat-parser.cpp
@@ -171,7 +171,8 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE ||
syntax_.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
return false;
}

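For context, a minimal sketch of what this early return means for callers, using the `common_chat_parse` API as it appears later in this diff; the chat format constant and the default `common_chat_syntax` fields are assumptions about the surrounding codebase, not something this PR adds:

```cpp
// Sketch, not a test from this PR: under minimax-m2, try_parse_reasoning()
// bails out, so thoughts stay inline in message.content and
// message.reasoning_content is left empty.
#include "chat.h"   // assumed location of common_chat_parse / common_chat_syntax
#include <cassert>

static void check_minimax_m2_keeps_tags_inline() {
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_DEEPSEEK_R1;   // any format that calls try_parse_reasoning (name assumed)
    syntax.reasoning_format = COMMON_REASONING_FORMAT_MINIMAX_M2;

    const auto msg = common_chat_parse(
        "<think>\nsome reasoning</think>final answer",
        /* is_partial= */ false,
        syntax);

    assert(msg.reasoning_content.empty());                       // nothing extracted
    assert(msg.content.find("</think>") != std::string::npos);   // tag kept in content
}
```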
10 changes: 8 additions & 2 deletions common/chat.cpp
@@ -655,6 +655,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
case COMMON_REASONING_FORMAT_AUTO: return "auto";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
case COMMON_REASONING_FORMAT_MINIMAX_M2: return "minimax-m2";
default:
throw std::runtime_error("Unknown reasoning format");
}
@@ -669,6 +670,8 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
return COMMON_REASONING_FORMAT_DEEPSEEK;
} else if (format == "deepseek-legacy") {
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
} else if (format == "minimax-m2") {
return COMMON_REASONING_FORMAT_MINIMAX_M2;
}
throw std::runtime_error("Unknown reasoning format: " + format);
}
@@ -1790,7 +1793,8 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
// </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
common_chat_parse_deepseek_v3_1_content(builder);
} else {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
return;
@@ -2001,7 +2005,9 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {

if (regex_match(analysis_regex, header)) {
builder.move_to(header_start_pos);
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2 ||
builder.syntax().reasoning_in_content) {
builder.add_content(consume_end(true));
} else {
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
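A quick sanity check of the two helpers above: the new name should round-trip through them. A minimal sketch, assuming both are declared in `chat.h`:

```cpp
// Sketch: the string <-> enum mapping added above should round-trip.
#include "chat.h"    // assumed to declare common_reasoning_format_name / _from_name
#include <cassert>
#include <cstring>

static void check_minimax_m2_name_roundtrip() {
    const common_reasoning_format fmt = common_reasoning_format_from_name("minimax-m2");
    assert(fmt == COMMON_REASONING_FORMAT_MINIMAX_M2);
    assert(std::strcmp(common_reasoning_format_name(fmt), "minimax-m2") == 0);
    // Unknown names still throw std::runtime_error, unchanged by this PR.
}
```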
1 change: 1 addition & 0 deletions common/common.h
@@ -249,6 +249,7 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_MINIMAX_M2, // Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility
// do not extend this enum unless you absolutely have to
// in most cases, use COMMON_REASONING_FORMAT_AUTO
// see: https://github.com/ggml-org/llama.cpp/pull/15408
2 changes: 1 addition & 1 deletion tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>- minimax-m2: streams a synthetic opening `<think>` tag and keeps `</think>` tags in `message.content` for MiniMax-M2 compatibility<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
73 changes: 62 additions & 11 deletions tools/server/server.cpp
@@ -443,7 +443,10 @@ struct server_task {
}
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
const auto requested = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
if (requested != COMMON_REASONING_FORMAT_AUTO) {
reasoning_format = requested;
}
}
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
@@ -1660,6 +1663,8 @@ struct server_slot {
bool has_next_token = true;
bool has_new_line = false;
bool truncated = false;
bool minimax_reasoning_prefix_injected = false;
bool minimax_reasoning_prefix_streamed = false;

stop_type stop;

@@ -1730,6 +1735,8 @@ struct server_slot {
generated_text = "";
has_new_line = false;
truncated = false;
minimax_reasoning_prefix_injected = false;
minimax_reasoning_prefix_streamed = false;
stop = STOP_TYPE_NONE;
stopping_word = "";
n_sent_text = 0;
@@ -1856,9 +1863,13 @@ struct server_slot {
GGML_ASSERT(task);

auto previous_msg = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
std::string text_to_parse = generated_text;
if (minimax_reasoning_prefix_injected) {
text_to_parse.insert(0, "<think>\n");
}
SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
auto new_msg = common_chat_parse(
generated_text,
text_to_parse,
/* is_partial= */ stop != STOP_TYPE_EOS,
task->params.oaicompat_chat_syntax);
if (!new_msg.empty()) {
@@ -2793,6 +2804,11 @@ struct server_context {

slot.state = SLOT_STATE_STARTED;

const bool needs_minimax_prefix =
slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
slot.minimax_reasoning_prefix_streamed = false;

SLT_INF(slot, "%s", "processing task\n");

return true;
@@ -2848,9 +2864,30 @@ struct server_context {
result.text_to_send = "";
}

std::string delta_to_send = result.text_to_send;
result.text_to_send = token_str;
slot.add_token(result);
if (slot.task->params.stream) {
send_partial_response(slot, result, false);
result.text_to_send = std::move(delta_to_send);

auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
if (!slot.task->params.stream) {
return;
}

if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
completion_token_output prefix_chunk{};
prefix_chunk.tok = LLAMA_TOKEN_NULL;
prefix_chunk.prob = 0.0f;
prefix_chunk.text_to_send = "<think>\n";
send_partial_response(slot, prefix_chunk, false);
slot.minimax_reasoning_prefix_streamed = true;
}

send_partial_response(slot, chunk, false);
};

if (send_text) {
stream_with_minimax_prefix(result);
}
}

@@ -3021,7 +3058,11 @@
return true;
}

void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
void send_partial_response(
server_slot & slot,
const completion_token_output & tkn,
bool is_progress,
const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
auto res = std::make_unique<server_task_result_cmpl_partial>();

res->id = slot.task->id;
@@ -3035,9 +3076,15 @@
res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
} else {
res->content = tkn.text_to_send;
res->tokens = { tkn.tok };
if (tkn.tok != LLAMA_TOKEN_NULL) {
res->tokens = { tkn.tok };
}

slot.update_chat_msg(res->oaicompat_msg_diffs);
if (forced_diffs) {
res->oaicompat_msg_diffs = *forced_diffs;
} else {
slot.update_chat_msg(res->oaicompat_msg_diffs);
}
}

res->n_decoded = slot.n_decoded;
Expand All @@ -3050,7 +3097,7 @@ struct server_context {
res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id;

// populate res.probs_output
if (slot.task->params.sampling.n_probs > 0) {
if (slot.task->params.sampling.n_probs > 0 && tkn.tok != LLAMA_TOKEN_NULL) {
res->prob_output = tkn; // copy the token probs
}

@@ -3068,8 +3115,12 @@
res->id = slot.task->id;
res->id_slot = slot.id;

res->index = slot.task->index;
res->content = slot.generated_text;
res->index = slot.task->index;
std::string response_content = slot.generated_text;
if (slot.minimax_reasoning_prefix_injected) {
response_content.insert(0, "<think>\n");
}
res->content = std::move(response_content);
res->tokens = std::move(slot.generated_tokens);
res->timings = slot.get_timings();
res->prompt = slot.task->tokens.detokenize(ctx, true);