@@ -1664,6 +1664,7 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
     bool minimax_reasoning_prefix_injected = false;
+    bool minimax_reasoning_prefix_streamed = false;

     stop_type stop;

@@ -1735,6 +1736,7 @@ struct server_slot {
         has_new_line = false;
         truncated = false;
         minimax_reasoning_prefix_injected = false;
+        minimax_reasoning_prefix_streamed = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -2844,16 +2846,8 @@ struct server_context {

         const bool needs_minimax_prefix =
             slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        if (needs_minimax_prefix) {
-            slot.minimax_reasoning_prefix_injected = true;
-            if (slot.task->params.stream) {
-                completion_token_output prefix_chunk{};
-                prefix_chunk.tok = LLAMA_TOKEN_NULL;
-                prefix_chunk.prob = 0.0f;
-                prefix_chunk.text_to_send = "<think>\n";
-                send_partial_response(slot, prefix_chunk, false);
-            }
-        }
+        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
+        slot.minimax_reasoning_prefix_streamed = false;

         SLT_INF(slot, "%s", "processing task\n");

@@ -2914,8 +2908,26 @@ struct server_context {
             result.text_to_send = token_str;
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
-            if (slot.task->params.stream) {
-                send_partial_response(slot, result, false);
+
+            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
+                if (!slot.task->params.stream) {
+                    return;
+                }
+
+                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+                    completion_token_output prefix_chunk{};
+                    prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                    prefix_chunk.prob = 0.0f;
+                    prefix_chunk.text_to_send = "<think>\n";
+                    send_partial_response(slot, prefix_chunk, false);
+                    slot.minimax_reasoning_prefix_streamed = true;
+                }
+
+                send_partial_response(slot, chunk, false);
+            };
+
+            if (send_text) {
+                stream_with_minimax_prefix(result);
             }
         }

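For context, here is a small standalone sketch of the lazy prefix-streaming pattern this diff introduces: the synthetic "<think>\n" chunk is sent at most once, immediately before the first real streamed token, instead of eagerly when the task starts. The names below (Slot, Chunk, send, stream_with_prefix) are illustrative stand-ins, not the server's actual types or API.

// Minimal sketch of the lazy prefix-streaming pattern (assumed, simplified names).
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Chunk {
    std::string text; // text to stream to the client
};

struct Slot {
    bool prefix_injected = false; // reasoning prefix was injected into the prompt
    bool prefix_streamed = false; // reasoning prefix has already been sent to the client
};

// Sends `chunk`, emitting the synthetic prefix first if it is still pending,
// mirroring the stream_with_minimax_prefix lambda in the diff above.
void stream_with_prefix(Slot & slot, const Chunk & chunk,
                        const std::function<void(const Chunk &)> & send) {
    if (slot.prefix_injected && !slot.prefix_streamed) {
        send(Chunk{"<think>\n"}); // emitted exactly once, before the first real token
        slot.prefix_streamed = true;
    }
    send(chunk);
}

int main() {
    Slot slot;
    slot.prefix_injected = true;

    const auto send = [](const Chunk & c) { std::cout << c.text; };

    const std::vector<std::string> tokens = {"Hello", ",", " world", "\n"};
    for (const auto & tok : tokens) {
        stream_with_prefix(slot, Chunk{tok}, send);
    }
    // Output: "<think>\nHello, world" -- the prefix appears only once.
}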