Skip to content

Commit 39351b1

Browse files
server: defer minimax-m2 synthetic <think> until first generated token
1 parent f55b71f commit 39351b1

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

tools/server/server.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,6 +1664,7 @@ struct server_slot {
16641664
bool has_new_line = false;
16651665
bool truncated = false;
16661666
bool minimax_reasoning_prefix_injected = false;
1667+
bool minimax_reasoning_prefix_streamed = false;
16671668

16681669
stop_type stop;
16691670

@@ -1735,6 +1736,7 @@ struct server_slot {
17351736
has_new_line = false;
17361737
truncated = false;
17371738
minimax_reasoning_prefix_injected = false;
1739+
minimax_reasoning_prefix_streamed = false;
17381740
stop = STOP_TYPE_NONE;
17391741
stopping_word = "";
17401742
n_sent_text = 0;
@@ -2804,16 +2806,8 @@ struct server_context {
28042806

28052807
const bool needs_minimax_prefix =
28062808
slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
2807-
if (needs_minimax_prefix) {
2808-
slot.minimax_reasoning_prefix_injected = true;
2809-
if (slot.task->params.stream) {
2810-
completion_token_output prefix_chunk{};
2811-
prefix_chunk.tok = LLAMA_TOKEN_NULL;
2812-
prefix_chunk.prob = 0.0f;
2813-
prefix_chunk.text_to_send = "<think>\n";
2814-
send_partial_response(slot, prefix_chunk, false);
2815-
}
2816-
}
2809+
slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
2810+
slot.minimax_reasoning_prefix_streamed = false;
28172811

28182812
SLT_INF(slot, "%s", "processing task\n");
28192813

@@ -2874,8 +2868,26 @@ struct server_context {
28742868
result.text_to_send = token_str;
28752869
slot.add_token(result);
28762870
result.text_to_send = std::move(delta_to_send);
2877-
if (slot.task->params.stream) {
2878-
send_partial_response(slot, result, false);
2871+
2872+
auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
2873+
if (!slot.task->params.stream) {
2874+
return;
2875+
}
2876+
2877+
if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
2878+
completion_token_output prefix_chunk{};
2879+
prefix_chunk.tok = LLAMA_TOKEN_NULL;
2880+
prefix_chunk.prob = 0.0f;
2881+
prefix_chunk.text_to_send = "<think>\n";
2882+
send_partial_response(slot, prefix_chunk, false);
2883+
slot.minimax_reasoning_prefix_streamed = true;
2884+
}
2885+
2886+
send_partial_response(slot, chunk, false);
2887+
};
2888+
2889+
if (send_text) {
2890+
stream_with_minimax_prefix(result);
28792891
}
28802892
}
28812893

0 commit comments

Comments (0)