
Commit b654838

server: defer minimax-m2 synthetic <think> until first generated token

1 parent: d225bed

1 file changed: +24 -12

tools/server/server.cpp

Lines changed: 24 additions & 12 deletions
@@ -1664,6 +1664,7 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
     bool minimax_reasoning_prefix_injected = false;
+    bool minimax_reasoning_prefix_streamed = false;
 
     stop_type stop;
 
@@ -1735,6 +1736,7 @@ struct server_slot {
         has_new_line = false;
         truncated = false;
         minimax_reasoning_prefix_injected = false;
+        minimax_reasoning_prefix_streamed = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -2843,16 +2845,8 @@ struct server_context {
 
         const bool needs_minimax_prefix =
             slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        if (needs_minimax_prefix) {
-            slot.minimax_reasoning_prefix_injected = true;
-            if (slot.task->params.stream) {
-                completion_token_output prefix_chunk{};
-                prefix_chunk.tok = LLAMA_TOKEN_NULL;
-                prefix_chunk.prob = 0.0f;
-                prefix_chunk.text_to_send = "<think>\n";
-                send_partial_response(slot, prefix_chunk, false);
-            }
-        }
+        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
+        slot.minimax_reasoning_prefix_streamed = false;
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -2913,8 +2907,26 @@ struct server_context {
             result.text_to_send = token_str;
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
-            if (slot.task->params.stream) {
-                send_partial_response(slot, result, false);
+
+            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
+                if (!slot.task->params.stream) {
+                    return;
+                }
+
+                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+                    completion_token_output prefix_chunk{};
+                    prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                    prefix_chunk.prob = 0.0f;
+                    prefix_chunk.text_to_send = "<think>\n";
+                    send_partial_response(slot, prefix_chunk, false);
+                    slot.minimax_reasoning_prefix_streamed = true;
+                }
+
+                send_partial_response(slot, chunk, false);
+            };
+
+            if (send_text) {
+                stream_with_minimax_prefix(result);
             }
         }
 
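For readers skimming the change: before this commit, the synthetic "<think>\n" prefix required by the minimax-m2 reasoning format was streamed as soon as the task started processing; the slot now only records that a prefix is owed and flushes it immediately before the first generated token. Below is a minimal standalone sketch of that deferred-prefix pattern. The Slot, Chunk, and emit names are illustrative stand-ins invented for this example, not the server's actual server_slot and completion_token_output types.

// Deferred-prefix pattern, distilled from the diff above.
// Slot/Chunk/emit are hypothetical stand-ins for illustration only.
#include <functional>
#include <iostream>
#include <string>

struct Chunk {
    std::string text; // text to stream to the client
};

struct Slot {
    bool prefix_injected = false; // this request owes a synthetic "<think>\n"
    bool prefix_streamed = false; // the prefix has already been sent
};

// Lazily emit the synthetic prefix just before the first real chunk,
// instead of at task start (mirrors stream_with_minimax_prefix above).
void stream_chunk(Slot & slot, const Chunk & chunk,
                  const std::function<void(const Chunk &)> & emit) {
    if (slot.prefix_injected && !slot.prefix_streamed) {
        emit(Chunk{"<think>\n"});
        slot.prefix_streamed = true;
    }
    emit(chunk);
}

int main() {
    Slot slot;
    slot.prefix_injected = true; // reasoning format asked for the prefix

    auto emit = [](const Chunk & c) { std::cout << c.text; };

    // Nothing is sent until the first generated token arrives.
    for (const char * tok : {"hello", ",", " world", "\n"}) {
        stream_chunk(slot, Chunk{tok}, emit);
    }
    return 0;
}

One consequence of deferring, and presumably the motivation: a streamed request that stops before producing any token no longer emits a dangling "<think>\n" chunk to the client.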