@@ -1664,6 +1664,7 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
     bool minimax_reasoning_prefix_injected = false;
+    bool minimax_reasoning_prefix_streamed = false;

     stop_type stop;

@@ -1735,6 +1736,7 @@ struct server_slot {
         has_new_line = false;
         truncated = false;
         minimax_reasoning_prefix_injected = false;
+        minimax_reasoning_prefix_streamed = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -2843,16 +2845,8 @@ struct server_context {

         const bool needs_minimax_prefix =
             slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        if (needs_minimax_prefix) {
-            slot.minimax_reasoning_prefix_injected = true;
-            if (slot.task->params.stream) {
-                completion_token_output prefix_chunk{};
-                prefix_chunk.tok = LLAMA_TOKEN_NULL;
-                prefix_chunk.prob = 0.0f;
-                prefix_chunk.text_to_send = "<think>\n";
-                send_partial_response(slot, prefix_chunk, false);
-            }
-        }
+        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
+        slot.minimax_reasoning_prefix_streamed = false;

         SLT_INF(slot, "%s", "processing task\n");

@@ -2913,8 +2907,26 @@ struct server_context {
             result.text_to_send = token_str;
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
-            if (slot.task->params.stream) {
-                send_partial_response(slot, result, false);
+
+            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
+                if (!slot.task->params.stream) {
+                    return;
+                }
+
+                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+                    completion_token_output prefix_chunk{};
+                    prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                    prefix_chunk.prob = 0.0f;
+                    prefix_chunk.text_to_send = "<think>\n";
+                    send_partial_response(slot, prefix_chunk, false);
+                    slot.minimax_reasoning_prefix_streamed = true;
+                }
+
+                send_partial_response(slot, chunk, false);
+            };
+
+            if (send_text) {
+                stream_with_minimax_prefix(result);
             }
         }

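Taken together, the change defers streaming the MiniMax "<think>\n" reasoning prefix: instead of pushing it as a synthetic chunk as soon as the task starts, the prefix is emitted lazily, right before the first real token is streamed, and the new minimax_reasoning_prefix_streamed flag guards against sending it twice. Below is a minimal, self-contained sketch of that lazy-prefix pattern; StreamState, emit_chunk, and stream_with_prefix are simplified stand-ins for illustration, not the server's actual types or API.

#include <iostream>
#include <string>

struct StreamState {
    bool prefix_injected = false; // the reasoning prefix belongs in the response
    bool prefix_streamed = false; // the prefix has already been sent to the client
};

// Stand-in for send_partial_response(): just write the chunk out.
static void emit_chunk(const std::string & text) {
    std::cout << text;
}

// Emit a generated delta, prepending "<think>\n" exactly once and only
// when there is real content to stream.
static void stream_with_prefix(StreamState & st, const std::string & delta) {
    if (st.prefix_injected && !st.prefix_streamed) {
        emit_chunk("<think>\n");
        st.prefix_streamed = true;
    }
    emit_chunk(delta);
}

int main() {
    StreamState st;
    st.prefix_injected = true;              // e.g. reasoning_format == MINIMAX_M2
    stream_with_prefix(st, "first token");  // "<think>\n" is sent lazily here
    stream_with_prefix(st, ", second");     // no duplicate prefix
    std::cout << "\n";
    return 0;
}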