@@ -1664,6 +1664,7 @@ struct server_slot {
16641664 bool has_new_line = false ;
16651665 bool truncated = false ;
16661666 bool minimax_reasoning_prefix_injected = false ;
1667+ bool minimax_reasoning_prefix_streamed = false ;
16671668
16681669 stop_type stop;
16691670
@@ -1735,6 +1736,7 @@ struct server_slot {
17351736 has_new_line = false ;
17361737 truncated = false ;
17371738 minimax_reasoning_prefix_injected = false ;
1739+ minimax_reasoning_prefix_streamed = false ;
17381740 stop = STOP_TYPE_NONE;
17391741 stopping_word = " " ;
17401742 n_sent_text = 0 ;
@@ -2804,16 +2806,8 @@ struct server_context {
28042806
28052807 const bool needs_minimax_prefix =
28062808 slot.task ->params .oaicompat_chat_syntax .reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
2807- if (needs_minimax_prefix) {
2808- slot.minimax_reasoning_prefix_injected = true ;
2809- if (slot.task ->params .stream ) {
2810- completion_token_output prefix_chunk{};
2811- prefix_chunk.tok = LLAMA_TOKEN_NULL;
2812- prefix_chunk.prob = 0 .0f ;
2813- prefix_chunk.text_to_send = " <think>\n " ;
2814- send_partial_response (slot, prefix_chunk, false );
2815- }
2816- }
2809+ slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
2810+ slot.minimax_reasoning_prefix_streamed = false ;
28172811
28182812 SLT_INF (slot, " %s" , " processing task\n " );
28192813
@@ -2874,8 +2868,26 @@ struct server_context {
28742868 result.text_to_send = token_str;
28752869 slot.add_token (result);
28762870 result.text_to_send = std::move (delta_to_send);
2877- if (slot.task ->params .stream ) {
2878- send_partial_response (slot, result, false );
2871+
// Local helper: forwards `chunk` to the client as a partial response, first
// emitting the reasoning-prefix chunk if the slot is flagged as needing it
// and it has not been sent yet. Captures the enclosing scope by reference
// (uses `slot`), so it is only meant to be called synchronously from here.
2872+ auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
// Nothing is sent for non-streaming tasks.
2873+ if (!slot.task ->params .stream ) {
2874+ return ;
2875+ }
2876+
// Send the prefix at most once per slot state: guarded by the
// `minimax_reasoning_prefix_streamed` flag, which is set right after sending.
2877+ if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed ) {
// The prefix chunk carries only text: its token is LLAMA_TOKEN_NULL and
// its probability is zero.
2878+ completion_token_output prefix_chunk{};
2879+ prefix_chunk.tok = LLAMA_TOKEN_NULL;
2880+ prefix_chunk.prob = 0 .0f ;
2881+ prefix_chunk.text_to_send = " <think>\n " ;
2882+ send_partial_response (slot, prefix_chunk, false );
2883+ slot.minimax_reasoning_prefix_streamed = true ;
2884+ }
2885+
// The prefix (if any) has gone out above, so the caller's chunk follows it.
2886+ send_partial_response (slot, chunk, false );
2887+ };
2888+
2889+ if (send_text) {
2890+ stream_with_minimax_prefix (result);
28792891 }
28802892 }
28812893
0 commit comments