@@ -1663,8 +1663,7 @@ struct server_slot {
     bool has_next_token = true;
     bool has_new_line   = false;
     bool truncated      = false;
-    bool minimax_reasoning_prefix_injected = false;
-    bool minimax_reasoning_prefix_streamed = false;
+    common_chat_stream_state reasoning_stream_state;
 
     stop_type stop;
 
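The `common_chat_stream_state` member replacing the two minimax-specific flags is not declared in this patch. Below is a minimal sketch of the interface implied by its call sites in the hunks that follow (`init`, `apply_reasoning_prefix`, `consume_reasoning_prefix`); the field names, the stand-in type definitions, and the `COMMON_REASONING_FORMAT_MINIMAX_M2` check inside `init` are assumptions for illustration, not the actual declaration from `common/chat.h`.

```cpp
// Sketch only: stand-ins for the real types in common/chat.h, reduced to what
// the call sites in this patch require.
#include <optional>
#include <string>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_MINIMAX_M2, // named in the code removed below
};

struct common_chat_syntax {
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
};

struct common_chat_stream_state {
    std::string reasoning_prefix;        // e.g. "<think>\n" for MiniMax M2
    bool        prefix_pending = false;  // true until the prefix is streamed once

    // Called once per task (see the launch hunk): derive the prefix from the syntax.
    void init(const common_chat_syntax & syntax) {
        const bool needs_prefix =
            syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
        reasoning_prefix = needs_prefix ? "<think>\n" : "";
        prefix_pending   = needs_prefix;
    }

    // Non-destructive: prepend the prefix so parsing and the final response
    // see the full text (used by update_chat_msg() and send_final_response()).
    std::string apply_reasoning_prefix(const std::string & text) const {
        return reasoning_prefix + text;
    }

    // Destructive: yield the prefix exactly once, so the streaming path emits
    // a single prefix chunk per response.
    std::optional<std::string> consume_reasoning_prefix() {
        if (!prefix_pending) {
            return std::nullopt;
        }
        prefix_pending = false;
        return reasoning_prefix;
    }
};
```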
@@ -1735,8 +1734,7 @@ struct server_slot {
         generated_text = "";
         has_new_line   = false;
         truncated      = false;
-        minimax_reasoning_prefix_injected = false;
-        minimax_reasoning_prefix_streamed = false;
+        reasoning_stream_state = {};
         stop           = STOP_TYPE_NONE;
         stopping_word  = "";
         n_sent_text    = 0;
@@ -1863,14 +1861,12 @@ struct server_slot {
         GGML_ASSERT(task);
 
         auto previous_msg = chat_msg;
-        std::string text_to_parse = generated_text;
-        if (minimax_reasoning_prefix_injected) {
-            text_to_parse.insert(0, "<think>\n");
-        }
+        const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix(generated_text);
         SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
-        auto new_msg = common_chat_parse(
-            text_to_parse,
+        auto new_msg = common_chat_parse_stream(
+            generated_text,
             /* is_partial= */ stop != STOP_TYPE_EOS,
+            reasoning_stream_state,
             task->params.oaicompat_chat_syntax);
         if (!new_msg.empty()) {
             new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
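`common_chat_parse_stream` is also introduced outside this file. Judging from the call site above, it takes the raw `generated_text` plus the stream state and applies the stored prefix internally before parsing. A plausible sketch, assuming it simply delegates to the pre-existing `common_chat_parse`:

```cpp
// Hypothetical wrapper consistent with the call site above; the real
// implementation lives alongside common_chat_parse and may differ.
static common_chat_msg common_chat_parse_stream(
        const std::string              & text,
        bool                             is_partial,
        const common_chat_stream_state & stream_state,
        const common_chat_syntax       & syntax) {
    // Reuse the existing parser on the prefix-adjusted text.
    return common_chat_parse(stream_state.apply_reasoning_prefix(text), is_partial, syntax);
}
```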
@@ -2843,10 +2839,7 @@ struct server_context {
 
         slot.state = SLOT_STATE_STARTED;
 
-        const bool needs_minimax_prefix =
-            slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
-        slot.minimax_reasoning_prefix_streamed = false;
+        slot.reasoning_stream_state.init(slot.task->params.oaicompat_chat_syntax);
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -2908,25 +2901,16 @@ struct server_context {
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
 
-            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
-                if (!slot.task->params.stream) {
-                    return;
-                }
-
-                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+            if (send_text && slot.task->params.stream) {
+                if (auto prefix = slot.reasoning_stream_state.consume_reasoning_prefix()) {
                     completion_token_output prefix_chunk{};
                     prefix_chunk.tok          = LLAMA_TOKEN_NULL;
                     prefix_chunk.prob         = 0.0f;
-                    prefix_chunk.text_to_send = "<think>\n";
+                    prefix_chunk.text_to_send = *prefix;
                     send_partial_response(slot, prefix_chunk, false);
-                    slot.minimax_reasoning_prefix_streamed = true;
                 }
 
-                send_partial_response(slot, chunk, false);
-            };
-
-            if (send_text) {
-                stream_with_minimax_prefix(result);
+                send_partial_response(slot, result, false);
             }
         }
 
@@ -3097,11 +3081,7 @@ struct server_context {
         return true;
     }
 
-    void send_partial_response(
-            server_slot & slot,
-            const completion_token_output & tkn,
-            bool is_progress,
-            const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id    = slot.task->id;
@@ -3119,11 +3099,7 @@ struct server_context {
                 res->tokens = { tkn.tok };
             }
 
-            if (forced_diffs) {
-                res->oaicompat_msg_diffs = *forced_diffs;
-            } else {
-                slot.update_chat_msg(res->oaicompat_msg_diffs);
-            }
+            slot.update_chat_msg(res->oaicompat_msg_diffs);
         }
 
         res->n_decoded           = slot.n_decoded;
@@ -3154,12 +3130,8 @@ struct server_context {
         res->id      = slot.task->id;
         res->id_slot = slot.id;
 
-        res->index    = slot.task->index;
-        std::string response_content = slot.generated_text;
-        if (slot.minimax_reasoning_prefix_injected) {
-            response_content.insert(0, "<think>\n");
-        }
-        res->content          = std::move(response_content);
+        res->index            = slot.task->index;
+        res->content          = slot.reasoning_stream_state.apply_reasoning_prefix(slot.generated_text);
         res->tokens           = std::move(slot.generated_tokens);
         res->timings          = slot.get_timings();
         res->prompt           = slot.task->tokens.detokenize(ctx, true);