@@ -1663,8 +1663,7 @@ struct server_slot {
16631663 bool has_next_token = true ;
16641664 bool has_new_line = false ;
16651665 bool truncated = false ;
1666- bool minimax_reasoning_prefix_injected = false ;
1667- bool minimax_reasoning_prefix_streamed = false ;
1666+ common_chat_stream_state reasoning_stream_state;
16681667
16691668 stop_type stop;
16701669
@@ -1735,8 +1734,7 @@ struct server_slot {
17351734 generated_text = " " ;
17361735 has_new_line = false ;
17371736 truncated = false ;
1738- minimax_reasoning_prefix_injected = false ;
1739- minimax_reasoning_prefix_streamed = false ;
1737+ reasoning_stream_state = {};
17401738 stop = STOP_TYPE_NONE;
17411739 stopping_word = " " ;
17421740 n_sent_text = 0 ;
@@ -1863,14 +1861,12 @@ struct server_slot {
18631861 GGML_ASSERT (task);
18641862
18651863 auto previous_msg = chat_msg;
1866- std::string text_to_parse = generated_text;
1867- if (minimax_reasoning_prefix_injected) {
1868- text_to_parse.insert (0 , " <think>\n " );
1869- }
1864+ const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix (generated_text);
18701865 SRV_DBG (" Parsing chat message: %s\n " , text_to_parse.c_str ());
1871- auto new_msg = common_chat_parse (
1872- text_to_parse ,
1866+ auto new_msg = common_chat_parse_stream (
1867+ generated_text ,
18731868 /* is_partial= */ stop != STOP_TYPE_EOS,
1869+ reasoning_stream_state,
18741870 task->params .oaicompat_chat_syntax );
18751871 if (!new_msg.empty ()) {
18761872 new_msg.set_tool_call_ids (generated_tool_call_ids, gen_tool_call_id);
@@ -2804,10 +2800,7 @@ struct server_context {
28042800
28052801 slot.state = SLOT_STATE_STARTED;
28062802
2807- const bool needs_minimax_prefix =
2808- slot.task ->params .oaicompat_chat_syntax .reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
2809- slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
2810- slot.minimax_reasoning_prefix_streamed = false ;
2803+ slot.reasoning_stream_state .init (slot.task ->params .oaicompat_chat_syntax );
28112804
28122805 SLT_INF (slot, " %s" , " processing task\n " );
28132806
@@ -2869,25 +2862,16 @@ struct server_context {
28692862 slot.add_token (result);
28702863 result.text_to_send = std::move (delta_to_send);
28712864
2872- auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
2873- if (!slot.task ->params .stream ) {
2874- return ;
2875- }
2876-
2877- if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed ) {
2865+ if (send_text && slot.task ->params .stream ) {
2866+ if (auto prefix = slot.reasoning_stream_state .consume_reasoning_prefix ()) {
28782867 completion_token_output prefix_chunk{};
28792868 prefix_chunk.tok = LLAMA_TOKEN_NULL;
28802869 prefix_chunk.prob = 0 .0f ;
2881- prefix_chunk.text_to_send = " <think> \n " ;
2870+ prefix_chunk.text_to_send = *prefix ;
28822871 send_partial_response (slot, prefix_chunk, false );
2883- slot.minimax_reasoning_prefix_streamed = true ;
28842872 }
28852873
2886- send_partial_response (slot, chunk, false );
2887- };
2888-
2889- if (send_text) {
2890- stream_with_minimax_prefix (result);
2874+ send_partial_response (slot, result, false );
28912875 }
28922876 }
28932877
@@ -3058,11 +3042,7 @@ struct server_context {
30583042 return true ;
30593043 }
30603044
3061- void send_partial_response (
3062- server_slot & slot,
3063- const completion_token_output & tkn,
3064- bool is_progress,
3065- const std::vector<common_chat_msg_diff> * forced_diffs = nullptr ) {
3045+ void send_partial_response (server_slot & slot, const completion_token_output & tkn, bool is_progress) {
30663046 auto res = std::make_unique<server_task_result_cmpl_partial>();
30673047
30683048 res->id = slot.task ->id ;
@@ -3080,11 +3060,7 @@ struct server_context {
30803060 res->tokens = { tkn.tok };
30813061 }
30823062
3083- if (forced_diffs) {
3084- res->oaicompat_msg_diffs = *forced_diffs;
3085- } else {
3086- slot.update_chat_msg (res->oaicompat_msg_diffs );
3087- }
3063+ slot.update_chat_msg (res->oaicompat_msg_diffs );
30883064 }
30893065
30903066 res->n_decoded = slot.n_decoded ;
@@ -3115,12 +3091,8 @@ struct server_context {
31153091 res->id = slot.task ->id ;
31163092 res->id_slot = slot.id ;
31173093
3118- res->index = slot.task ->index ;
3119- std::string response_content = slot.generated_text ;
3120- if (slot.minimax_reasoning_prefix_injected ) {
3121- response_content.insert (0 , " <think>\n " );
3122- }
3123- res->content = std::move (response_content);
3094+ res->index = slot.task ->index ;
3095+ res->content = slot.reasoning_stream_state .apply_reasoning_prefix (slot.generated_text );
31243096 res->tokens = std::move (slot.generated_tokens );
31253097 res->timings = slot.get_timings ();
31263098 res->prompt = slot.task ->tokens .detokenize (ctx, true );
0 commit comments