Commit cdf17b6 (parent: 18a25de)

server: defer minimax-m2 synthetic <think> until first generated token
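
In short: when the MiniMax-M2 reasoning format is selected, the server injects a synthetic "<think>\n" prefix into streamed responses. Previously this prefix chunk was sent eagerly, as soon as the slot began processing the task. With this change, task setup only records that the prefix is needed (minimax_reasoning_prefix_injected), and a new flag, minimax_reasoning_prefix_streamed, tracks whether it has actually been sent; the prefix is now emitted immediately before the first generated token is streamed. A standalone sketch of the pattern follows the diff below.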

1 file changed: tools/server/server.cpp (24 additions, 12 deletions)

@@ -1664,6 +1664,7 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
     bool minimax_reasoning_prefix_injected = false;
+    bool minimax_reasoning_prefix_streamed = false;

     stop_type stop;

@@ -1735,6 +1736,7 @@ struct server_slot {
         has_new_line = false;
         truncated = false;
         minimax_reasoning_prefix_injected = false;
+        minimax_reasoning_prefix_streamed = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -2844,16 +2846,8 @@ struct server_context {

         const bool needs_minimax_prefix =
             slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        if (needs_minimax_prefix) {
-            slot.minimax_reasoning_prefix_injected = true;
-            if (slot.task->params.stream) {
-                completion_token_output prefix_chunk{};
-                prefix_chunk.tok = LLAMA_TOKEN_NULL;
-                prefix_chunk.prob = 0.0f;
-                prefix_chunk.text_to_send = "<think>\n";
-                send_partial_response(slot, prefix_chunk, false);
-            }
-        }
+        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
+        slot.minimax_reasoning_prefix_streamed = false;

         SLT_INF(slot, "%s", "processing task\n");

@@ -2914,8 +2908,26 @@
         result.text_to_send = token_str;
         slot.add_token(result);
         result.text_to_send = std::move(delta_to_send);
-        if (slot.task->params.stream) {
-            send_partial_response(slot, result, false);
+
+        auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
+            if (!slot.task->params.stream) {
+                return;
+            }
+
+            if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+                completion_token_output prefix_chunk{};
+                prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                prefix_chunk.prob = 0.0f;
+                prefix_chunk.text_to_send = "<think>\n";
+                send_partial_response(slot, prefix_chunk, false);
+                slot.minimax_reasoning_prefix_streamed = true;
+            }
+
+            send_partial_response(slot, chunk, false);
+        };
+
+        if (send_text) {
+            stream_with_minimax_prefix(result);
         }
     }

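
For illustration, here is the deferred-prefix pattern from the new lambda in isolation: a one-shot prefix is emitted lazily, immediately before the first real streamed chunk, rather than eagerly at task start. This is a minimal sketch with simplified stand-in types; Chunk, Streamer, and emit() are hypothetical names, not the server's actual API (the real code uses completion_token_output and send_partial_response()).

    #include <iostream>
    #include <string>

    // Hypothetical stand-ins for completion_token_output and
    // send_partial_response(); names are illustrative only.
    struct Chunk {
        std::string text;
    };

    struct Streamer {
        bool prefix_injected = true;   // mirrors minimax_reasoning_prefix_injected
        bool prefix_streamed = false;  // mirrors minimax_reasoning_prefix_streamed

        void emit(const Chunk & chunk) {
            std::cout << chunk.text;
        }

        // Send the synthetic prefix at most once, right before the first
        // generated chunk, not when the task starts processing.
        void stream(const Chunk & chunk) {
            if (prefix_injected && !prefix_streamed) {
                emit(Chunk{"<think>\n"});
                prefix_streamed = true;
            }
            emit(chunk);
        }
    };

    int main() {
        Streamer s;
        s.stream(Chunk{"Hello"});   // emits "<think>\n" first, then "Hello"
        s.stream(Chunk{" world"});  // prefix already sent; emits only " world"
        return 0;
    }

One consequence of deferring is that a stream which never produces a token never receives a dangling "<think>\n" chunk, since the prefix is only sent on the first real emission.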
