@@ -1663,8 +1663,7 @@ struct server_slot {
     bool has_next_token = true;
     bool has_new_line   = false;
     bool truncated      = false;
-    bool minimax_reasoning_prefix_injected = false;
-    bool minimax_reasoning_prefix_streamed = false;
+    common_chat_stream_state reasoning_stream_state;
 
     stop_type stop;
 
@@ -1735,8 +1734,7 @@ struct server_slot {
         generated_text = "";
         has_new_line   = false;
         truncated      = false;
-        minimax_reasoning_prefix_injected = false;
-        minimax_reasoning_prefix_streamed = false;
+        reasoning_stream_state = {};
         stop           = STOP_TYPE_NONE;
         stopping_word  = "";
         n_sent_text    = 0;
@@ -1863,14 +1861,12 @@ struct server_slot {
         GGML_ASSERT(task);
 
         auto previous_msg = chat_msg;
-        std::string text_to_parse = generated_text;
-        if (minimax_reasoning_prefix_injected) {
-            text_to_parse.insert(0, "<think>\n");
-        }
+        const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix(generated_text);
         SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
-        auto new_msg = common_chat_parse(
-            text_to_parse,
+        auto new_msg = common_chat_parse_stream(
+            generated_text,
             /* is_partial= */ stop != STOP_TYPE_EOS,
+            reasoning_stream_state,
             task->params.oaicompat_chat_syntax);
         if (!new_msg.empty()) {
             new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
@@ -2844,10 +2840,7 @@ struct server_context {
 
         slot.state = SLOT_STATE_STARTED;
 
-        const bool needs_minimax_prefix =
-            slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
-        slot.minimax_reasoning_prefix_streamed = false;
+        slot.reasoning_stream_state.init(slot.task->params.oaicompat_chat_syntax);
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -2909,25 +2902,16 @@ struct server_context {
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
 
-            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
-                if (!slot.task->params.stream) {
-                    return;
-                }
-
-                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+            if (send_text && slot.task->params.stream) {
+                if (auto prefix = slot.reasoning_stream_state.consume_reasoning_prefix()) {
                     completion_token_output prefix_chunk{};
                     prefix_chunk.tok          = LLAMA_TOKEN_NULL;
                     prefix_chunk.prob         = 0.0f;
-                    prefix_chunk.text_to_send = "<think>\n";
+                    prefix_chunk.text_to_send = *prefix;
                     send_partial_response(slot, prefix_chunk, false);
-                    slot.minimax_reasoning_prefix_streamed = true;
                 }
 
-                send_partial_response(slot, chunk, false);
-            };
-
-            if (send_text) {
-                stream_with_minimax_prefix(result);
+                send_partial_response(slot, result, false);
             }
         }
 
@@ -3098,11 +3082,7 @@ struct server_context {
         return true;
     }
 
-    void send_partial_response(
-            server_slot & slot,
-            const completion_token_output & tkn,
-            bool is_progress,
-            const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id    = slot.task->id;
@@ -3120,11 +3100,7 @@ struct server_context {
                 res->tokens  = { tkn.tok };
             }
 
-            if (forced_diffs) {
-                res->oaicompat_msg_diffs = *forced_diffs;
-            } else {
-                slot.update_chat_msg(res->oaicompat_msg_diffs);
-            }
+            slot.update_chat_msg(res->oaicompat_msg_diffs);
         }
 
         res->n_decoded           = slot.n_decoded;
@@ -3155,12 +3131,8 @@ struct server_context {
         res->id       = slot.task->id;
         res->id_slot  = slot.id;
 
-        res->index    = slot.task->index;
-        std::string response_content = slot.generated_text;
-        if (slot.minimax_reasoning_prefix_injected) {
-            response_content.insert(0, "<think>\n");
-        }
-        res->content          = std::move(response_content);
+        res->index            = slot.task->index;
+        res->content          = slot.reasoning_stream_state.apply_reasoning_prefix(slot.generated_text);
         res->tokens           = std::move(slot.generated_tokens);
         res->timings          = slot.get_timings();
         res->prompt           = slot.task->tokens.detokenize(ctx, true);
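
Note: the diff replaces the two MiniMax-specific booleans with a single common_chat_stream_state object that owns the reasoning-prefix bookkeeping. The call sites imply a small interface: init() latches whether the chat syntax wants a reasoning prefix injected, consume_reasoning_prefix() hands the prefix out exactly once so the streaming loop can emit it as a synthetic first chunk, and apply_reasoning_prefix() prepends it for whole-text consumers. The sketch below is a minimal illustration of that inferred interface, not the actual definition in common/chat.h; the syntax fields used here (inject_reasoning_prefix, reasoning_prefix) are hypothetical stand-ins.

// Minimal sketch of the interface the new call sites imply; the real
// common_chat_stream_state may differ.
#include <optional>
#include <string>

struct chat_syntax_sketch {
    bool        inject_reasoning_prefix = false; // hypothetical flag
    std::string reasoning_prefix;                // e.g. "<think>\n" for MiniMax M2
};

struct chat_stream_state_sketch {
    // Latch whether this syntax wants a reasoning prefix injected
    // (the role of the old needs_minimax_prefix / ..._injected flags).
    void init(const chat_syntax_sketch & syntax) {
        prefix          = syntax.inject_reasoning_prefix ? syntax.reasoning_prefix : "";
        prefix_streamed = false;
    }

    // Hand the prefix out exactly once, so the streaming path can emit it
    // as a synthetic first chunk (the role of the old ..._streamed flag).
    std::optional<std::string> consume_reasoning_prefix() {
        if (prefix.empty() || prefix_streamed) {
            return std::nullopt;
        }
        prefix_streamed = true;
        return prefix;
    }

    // Prepend the prefix for whole-text consumers: the final response body
    // and the text logged before incremental parsing.
    std::string apply_reasoning_prefix(const std::string & text) const {
        return prefix.empty() ? text : prefix + text;
    }

private:
    std::string prefix;
    bool        prefix_streamed = false;
};

Because consume_reasoning_prefix() self-latches, the streaming loop no longer needs the wrapper lambda, and reset() can clear all of the state at once with reasoning_stream_state = {}.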