Commit f55b71f

server: add minimax-m2 reasoning format override for MiniMax-M2 compatibility
MiniMax-M2 models require the complete `<think>...</think>` block, including tags, to be present in the context for proper reasoning. This mode injects a synthetic opening `<think>` tag in the stream while keeping all reasoning tags inline in `message.content`, ensuring the model receives the full reasoning block it needs.

Changes:
- Add COMMON_REASONING_FORMAT_MINIMAX_M2 enum value to common_reasoning_format
- Implement minimax-m2 format parsing that bypasses reasoning extraction
- Inject synthetic `<think>\n` chunk at slot start when minimax-m2 is active
- Track injection state with the minimax_reasoning_prefix_injected slot flag
- Prepend `<think>\n` to generated_text for the final response and chat parsing
- Prevent client reasoning_format=auto from overriding the server CLI setting
- Add minimax-m2 to CLI help, README.md, and code documentation
- Handle LLAMA_TOKEN_NULL in send_partial_response to skip token recording
- Update process_token to preserve delta_to_send for streaming correctness
1 parent 7db35a7 commit f55b71f
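To make the mechanism concrete, here is a minimal, self-contained C++ sketch of the flow the commit describes: mark the slot when minimax-m2 is active, emit a synthetic `<think>\n` chunk (carrying no real token) at slot start in streaming mode, and prepend the same prefix to the accumulated text before chat parsing and final-response assembly. The types and helpers below are simplified stand-ins for illustration, not the server's actual structures.

```cpp
// Simplified stand-ins for the server's slot and chunk types (illustration only).
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static constexpr int32_t TOKEN_NULL = -1;    // plays the role of LLAMA_TOKEN_NULL here

struct chunk {
    int32_t     tok = TOKEN_NULL;
    std::string text;                        // delta streamed to the client
};

struct slot_state {
    bool        prefix_injected = false;     // mirrors minimax_reasoning_prefix_injected
    std::string generated_text;              // accumulated model output, without the prefix
};

// Slot start: remember that the prefix was injected and, when streaming,
// emit a synthetic chunk that carries the opening tag but no real token.
std::vector<chunk> start_slot(slot_state & s, bool streaming) {
    std::vector<chunk> out;
    s.prefix_injected = true;
    if (streaming) {
        out.push_back({TOKEN_NULL, "<think>\n"});
    }
    return out;
}

// Chat parsing / final response: prepend the same prefix so both the parser and
// the client see the complete <think>...</think> block the model expects.
std::string text_for_parsing(const slot_state & s) {
    std::string text = s.generated_text;
    if (s.prefix_injected) {
        text.insert(0, "<think>\n");
    }
    return text;
}

int main() {
    slot_state s;
    const auto chunks = start_slot(s, /*streaming=*/true);
    s.generated_text = "plan the answer\n</think>42";

    for (const auto & c : chunks) {
        std::cout << c.text;                 // prints the injected "<think>\n"
    }
    std::cout << s.generated_text << "\n";
    std::cout << "parsed over: " << text_for_parsing(s) << "\n";
}
```

In the server change itself, the prefixed text is what `update_chat_msg` parses and what the final `res->content` begins with, so downstream tooling sees the tags exactly as MiniMax-M2 emitted them.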

File tree (6 files changed: +61 −13 lines)

- common/arg.cpp
- common/chat-parser.cpp
- common/chat.cpp
- common/common.h
- tools/server/README.md
- tools/server/server.cpp


common/arg.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -3442,6 +3442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content`\n"
         "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
+        "- minimax-m2: streams a synthetic opening `<think>` and keeps `</think>` tags in `message.content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
```

common/chat-parser.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -171,7 +171,8 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
     std::string pending_reasoning_prefix;
 
-    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+        syntax_.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
         return false;
     }
 
```
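The early return above means minimax-m2 behaves like `none` inside the reasoning parser: thought tags are never extracted into `message.reasoning_content`. A rough illustration of that contract, using a toy parser rather than `common_chat_msg_parser`:

```cpp
// Toy parser contrasting the two behaviours (illustration only):
// deepseek-style formats move the thought block into reasoning_content,
// while none/minimax-m2 leave the tags and the thoughts inline in content.
#include <iostream>
#include <string>

enum class fmt { none, deepseek, minimax_m2 };

struct msg {
    std::string content;
    std::string reasoning_content;
};

msg parse(const std::string & text, fmt reasoning_format) {
    const std::string open = "<think>", close = "</think>";
    const auto b = text.find(open);
    const auto e = text.find(close);
    const bool extract = reasoning_format == fmt::deepseek;

    msg out;
    if (extract && b != std::string::npos && e != std::string::npos && b < e) {
        out.reasoning_content = text.substr(b + open.size(), e - (b + open.size()));
        out.content           = text.substr(e + close.size());
    } else {
        out.content = text;   // none / minimax-m2: nothing is extracted
    }
    return out;
}

int main() {
    const std::string text = "<think>\nplan the answer\n</think>42";
    const msg extracted = parse(text, fmt::deepseek);
    const msg inline_kept = parse(text, fmt::minimax_m2);
    std::cout << "deepseek   -> reasoning_content: " << extracted.reasoning_content << "\n";
    std::cout << "minimax-m2 -> content: " << inline_kept.content << "\n";
}
```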

common/chat.cpp

Lines changed: 8 additions & 2 deletions
```diff
@@ -655,6 +655,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_MINIMAX_M2: return "minimax-m2";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -669,6 +670,8 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
         return COMMON_REASONING_FORMAT_DEEPSEEK;
     } else if (format == "deepseek-legacy") {
         return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    } else if (format == "minimax-m2") {
+        return COMMON_REASONING_FORMAT_MINIMAX_M2;
     }
     throw std::runtime_error("Unknown reasoning format: " + format);
 }
@@ -1790,7 +1793,8 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
         // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
         common_chat_parse_deepseek_v3_1_content(builder);
     } else {
-        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+            builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
             LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
             common_chat_parse_deepseek_v3_1_content(builder);
             return;
@@ -2001,7 +2005,9 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 
         if (regex_match(analysis_regex, header)) {
             builder.move_to(header_start_pos);
-            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+                builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2 ||
+                builder.syntax().reasoning_in_content) {
                 builder.add_content(consume_end(true));
             } else {
                 builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
```
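Since both helpers gain the new entry, the name/enum mapping stays symmetric. A hypothetical round-trip check against a simplified copy of the mapping (not the real headers or helpers) could look like:

```cpp
// Hypothetical round-trip check over a simplified copy of the mapping
// (the real helpers live in common/chat.cpp and use the COMMON_* names).
#include <cassert>
#include <stdexcept>
#include <string>

enum fmt { FMT_NONE, FMT_AUTO, FMT_DEEPSEEK_LEGACY, FMT_DEEPSEEK, FMT_MINIMAX_M2 };

const char * name_of(fmt f) {
    switch (f) {
        case FMT_NONE:            return "none";
        case FMT_AUTO:            return "auto";
        case FMT_DEEPSEEK_LEGACY: return "deepseek-legacy";
        case FMT_DEEPSEEK:        return "deepseek";
        case FMT_MINIMAX_M2:      return "minimax-m2";
    }
    throw std::runtime_error("unknown reasoning format");
}

fmt from_name(const std::string & s) {
    if (s == "none")            { return FMT_NONE; }
    if (s == "auto")            { return FMT_AUTO; }
    if (s == "deepseek-legacy") { return FMT_DEEPSEEK_LEGACY; }
    if (s == "deepseek")        { return FMT_DEEPSEEK; }
    if (s == "minimax-m2")      { return FMT_MINIMAX_M2; }
    throw std::runtime_error("unknown reasoning format: " + s);
}

int main() {
    for (fmt f : {FMT_NONE, FMT_AUTO, FMT_DEEPSEEK_LEGACY, FMT_DEEPSEEK, FMT_MINIMAX_M2}) {
        assert(from_name(name_of(f)) == f);   // adding minimax-m2 keeps the mapping symmetric
    }
}
```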

common/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -249,6 +249,7 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_MINIMAX_M2,      // Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility
     // do not extend this enum unless you absolutely have to
     // in most cases, use COMMON_REASONING_FORMAT_AUTO
     // see: https://github.com/ggml-org/llama.cpp/pull/15408
```

tools/server/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>- minimax-m2: Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
```

tools/server/server.cpp

Lines changed: 48 additions & 9 deletions
```diff
@@ -443,7 +443,10 @@ struct server_task {
         }
         common_reasoning_format reasoning_format = params_base.reasoning_format;
         if (data.contains("reasoning_format")) {
-            reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+            const auto requested = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+            if (requested != COMMON_REASONING_FORMAT_AUTO) {
+                reasoning_format = requested;
+            }
         }
         params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
         params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
@@ -1660,6 +1663,7 @@ struct server_slot {
     bool has_next_token = true;
     bool has_new_line = false;
    bool truncated = false;
+    bool minimax_reasoning_prefix_injected = false;
 
     stop_type stop;
 
@@ -1730,6 +1734,7 @@ struct server_slot {
         generated_text = "";
         has_new_line = false;
         truncated = false;
+        minimax_reasoning_prefix_injected = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -1856,9 +1861,13 @@ struct server_slot {
         GGML_ASSERT(task);
 
         auto previous_msg = chat_msg;
-        SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
+        std::string text_to_parse = generated_text;
+        if (minimax_reasoning_prefix_injected) {
+            text_to_parse.insert(0, "<think>\n");
+        }
+        SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
         auto new_msg = common_chat_parse(
-            generated_text,
+            text_to_parse,
             /* is_partial= */ stop != STOP_TYPE_EOS,
             task->params.oaicompat_chat_syntax);
         if (!new_msg.empty()) {
@@ -2793,6 +2802,19 @@ struct server_context {
 
         slot.state = SLOT_STATE_STARTED;
 
+        const bool needs_minimax_prefix =
+            slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
+        if (needs_minimax_prefix) {
+            slot.minimax_reasoning_prefix_injected = true;
+            if (slot.task->params.stream) {
+                completion_token_output prefix_chunk{};
+                prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                prefix_chunk.prob = 0.0f;
+                prefix_chunk.text_to_send = "<think>\n";
+                send_partial_response(slot, prefix_chunk, false);
+            }
+        }
+
         SLT_INF(slot, "%s", "processing task\n");
 
         return true;
@@ -2848,7 +2870,10 @@ struct server_context {
             result.text_to_send = "";
         }
 
+        std::string delta_to_send = result.text_to_send;
+        result.text_to_send = token_str;
         slot.add_token(result);
+        result.text_to_send = std::move(delta_to_send);
         if (slot.task->params.stream) {
             send_partial_response(slot, result, false);
         }
@@ -3021,7 +3046,11 @@ struct server_context {
         return true;
     }
 
-    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
+    void send_partial_response(
+            server_slot & slot,
+            const completion_token_output & tkn,
+            bool is_progress,
+            const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id = slot.task->id;
@@ -3035,9 +3064,15 @@
             res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
         } else {
             res->content = tkn.text_to_send;
-            res->tokens = { tkn.tok };
+            if (tkn.tok != LLAMA_TOKEN_NULL) {
+                res->tokens = { tkn.tok };
+            }
 
-            slot.update_chat_msg(res->oaicompat_msg_diffs);
+            if (forced_diffs) {
+                res->oaicompat_msg_diffs = *forced_diffs;
+            } else {
+                slot.update_chat_msg(res->oaicompat_msg_diffs);
+            }
         }
 
         res->n_decoded = slot.n_decoded;
@@ -3050,7 +3085,7 @@
         res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id;
 
         // populate res.probs_output
-        if (slot.task->params.sampling.n_probs > 0) {
+        if (slot.task->params.sampling.n_probs > 0 && tkn.tok != LLAMA_TOKEN_NULL) {
            res->prob_output = tkn; // copy the token probs
         }
 
@@ -3068,8 +3103,12 @@
         res->id = slot.task->id;
         res->id_slot = slot.id;
 
-        res->index = slot.task->index;
-        res->content = slot.generated_text;
+        res->index = slot.task->index;
+        std::string response_content = slot.generated_text;
+        if (slot.minimax_reasoning_prefix_injected) {
+            response_content.insert(0, "<think>\n");
+        }
+        res->content = std::move(response_content);
         res->tokens = std::move(slot.generated_tokens);
         res->timings = slot.get_timings();
         res->prompt = slot.task->tokens.detokenize(ctx, true);
```
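The first server.cpp hunk also fixes request-level precedence: a per-request `reasoning_format` of `auto` no longer clobbers the format chosen on the server command line, while any explicit non-auto value still overrides it. A small sketch of that rule, with illustrative names in place of the real request-parsing code:

```cpp
// Sketch of the precedence rule (illustrative names, not the server's request parser):
// a per-request "reasoning_format" only replaces the server-wide default when it is not "auto".
#include <iostream>
#include <string>

enum fmt { FMT_AUTO, FMT_DEEPSEEK, FMT_MINIMAX_M2 };

fmt from_name(const std::string & s) {
    if (s == "deepseek")   { return FMT_DEEPSEEK; }
    if (s == "minimax-m2") { return FMT_MINIMAX_M2; }
    return FMT_AUTO;   // "auto" (error handling omitted for brevity)
}

// requested_name is nullptr when the request carries no "reasoning_format" field.
fmt effective_format(fmt server_default, const std::string * requested_name) {
    fmt result = server_default;
    if (requested_name != nullptr) {
        const fmt requested = from_name(*requested_name);
        if (requested != FMT_AUTO) {   // "auto" no longer overrides the CLI choice
            result = requested;
        }
    }
    return result;
}

int main() {
    const std::string auto_req = "auto";
    const std::string deepseek_req = "deepseek";
    std::cout << effective_format(FMT_MINIMAX_M2, &auto_req)     << "\n";  // 2: server setting kept
    std::cout << effective_format(FMT_MINIMAX_M2, &deepseek_req) << "\n";  // 1: explicit override wins
    std::cout << effective_format(FMT_MINIMAX_M2, nullptr)       << "\n";  // 2: field absent
}
```

With this rule, a server started with `--reasoning-format minimax-m2` keeps that behavior even for clients that always send `reasoning_format: "auto"` by default.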
