 import pickle
 import uuid

+from lightllm.server.reasoning_parser import ReasoningParser
+
 from .function_call_parser import TOOLS_TAG_LIST, FunctionCallParser, ToolCallItem
 from .build_prompt import build_prompt, init_tokenizer

 from http import HTTPStatus
 from PIL import Image
 import multiprocessing as mp
-from typing import AsyncGenerator, Union, List, Dict
+from typing import Any, AsyncGenerator, Optional, Union, List, Dict
 from typing import Callable
 from lightllm.server import TokenLoad
 from fastapi import BackgroundTasks, FastAPI, Request, WebSocket, WebSocketDisconnect
@@ -109,6 +111,38 @@ def _get_history_tool_calls_cnt(request: ChatCompletionRequest) -> int:
     return idx


+def _get_reasoning_from_request(request: ChatCompletionRequest) -> bool:
+    """Determine whether the request should enable reasoning."""
+    reasoning_parser = get_env_start_args().reasoning_parser
+    if not reasoning_parser:
+        return False
+    if reasoning_parser in ["deepseek-v3"]:
+        return request.chat_template_kwargs is not None and request.chat_template_kwargs.get("thinking") is True
+    if reasoning_parser in ["qwen3", "glm45", "nano_v3", "interns1"]:
+        # qwen3, glm45, nano_v3, and interns1 enable reasoning by default
+        return not request.chat_template_kwargs or request.chat_template_kwargs.get("enable_thinking", True) is True
+    return True  # default
+
+
+def _process_reasoning_stream(
+    index: int,
+    delta: str,
+    reasoning_parser_dict: Dict[int, ReasoningParser],
+    content: Dict[str, Any],
+    request: ChatCompletionRequest,
+) -> tuple[Optional[str], str]:
+    """Process reasoning content in a streaming response."""
+    if index not in reasoning_parser_dict:
+        request_enable_reasoning = _get_reasoning_from_request(request)
+        reasoning_parser_dict[index] = ReasoningParser(
+            get_env_start_args().reasoning_parser,
+            request.stream_reasoning,
+            request_enable_reasoning,
+        )
+    reasoning_parser = reasoning_parser_dict[index]
+    return reasoning_parser.parse_stream_chunk(delta)
+
+
 async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Request) -> Response:
     from .api_http import g_objs

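The helper above encodes per-model defaults: deepseek-v3 reasons only when `chat_template_kwargs` explicitly sets `thinking: true`, while the qwen3-family parsers reason unless `enable_thinking` is explicitly false. A minimal sketch of that decision table, using `SimpleNamespace` as a stand-in for the pydantic `ChatCompletionRequest` (an assumption made purely for illustration):

```python
# Sketch of the decision table in _get_reasoning_from_request;
# SimpleNamespace replaces the real ChatCompletionRequest here.
from types import SimpleNamespace

def expected_reasoning(parser, kwargs):
    req = SimpleNamespace(chat_template_kwargs=kwargs)
    if parser == "deepseek-v3":
        # opt-in: reasons only when thinking is explicitly requested
        return req.chat_template_kwargs is not None and req.chat_template_kwargs.get("thinking") is True
    if parser in ("qwen3", "glm45", "nano_v3", "interns1"):
        # opt-out: reasons unless enable_thinking is explicitly False
        return not req.chat_template_kwargs or req.chat_template_kwargs.get("enable_thinking", True) is True
    return True

assert expected_reasoning("deepseek-v3", None) is False
assert expected_reasoning("deepseek-v3", {"thinking": True}) is True
assert expected_reasoning("qwen3", None) is True
assert expected_reasoning("qwen3", {"enable_thinking": False}) is False
```

Unknown parser names fall through to `True`, so a newly added reasoning model is treated as always-on until it gets an explicit rule.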
@@ -226,10 +260,30 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Request) -> Response:

         finish_reason = finish_reason_dict[sub_req_id]
         text = "".join(final_output_dict[sub_req_id])
+
+        # Handle reasoning content
+        reasoning_text = None
+        reasoning_parser = get_env_start_args().reasoning_parser
+        if reasoning_parser and request.separate_reasoning:
+            request_enable_reasoning = _get_reasoning_from_request(request)
+            try:
+                parser = ReasoningParser(
+                    model_type=reasoning_parser,
+                    stream_reasoning=False,
+                    force_reasoning=request_enable_reasoning,
+                )
+                reasoning_text, text = parser.parse_non_stream(text)
+            except Exception as e:
+                logger.error(f"Reasoning parsing error: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse reasoning content!",
+                )
+
+        # Handle tool_calls parsing
         tool_calls = None
         tool_choice = request.tool_choice
         tools = request.tools
-
         if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
             if finish_reason == "stop":
                 finish_reason = "tool_calls"
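In the non-stream path, `parse_non_stream` splits the generated text into the chain-of-thought and the final answer. A conceptual sketch, assuming deepseek-style `<think>...</think>` tags; the actual tag handling, including `force_reasoning` for chat templates that emit the opening tag themselves, lives in `lightllm.server.reasoning_parser`:

```python
from typing import Optional, Tuple

# Conceptual sketch of ReasoningParser.parse_non_stream: returns
# (reasoning_text, answer_text), with reasoning_text = None when no
# complete think block is present.
def parse_non_stream_sketch(text: str) -> Tuple[Optional[str], str]:
    start_tag, end_tag = "<think>", "</think>"
    if start_tag in text and end_tag in text:
        head, _, rest = text.partition(start_tag)
        reasoning, _, answer = rest.partition(end_tag)
        return reasoning.strip(), (head + answer).strip()
    return None, text

reasoning, answer = parse_non_stream_sketch("<think>2 + 2 = 4</think>The answer is 4.")
assert reasoning == "2 + 2 = 4"
assert answer == "The answer is 4."
```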
@@ -257,7 +311,12 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Request) -> Response:
         )
         if finish_reason == "tool_calls":
             text = ""
-        chat_message = ChatMessage(role="assistant", content=text, tool_calls=tool_calls)
+        chat_message = ChatMessage(
+            role="assistant",
+            content=text if text else "",
+            tool_calls=tool_calls,
+            reasoning_content=reasoning_text if reasoning_text else "",
+        )
         choice = ChatCompletionResponseChoice(
             index=i,
             message=chat_message,
@@ -273,6 +332,7 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Request) -> Response:
         return create_error_response(HTTPStatus.BAD_REQUEST, "stream api only support n = 1")

     parser_dict = {}
+    reasoning_parser_dict = {}

     # Streaming case
     async def stream_results() -> AsyncGenerator[bytes, None]:
@@ -284,12 +344,31 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
             prompt_tokens = metadata["prompt_tokens"]
             completion_tokens += 1
-            if request.tool_choice != "none" and request.tools:
-                delta = request_output
-                group_request_id = convert_sub_id_to_group_id(sub_req_id)
-                index = sub_req_id
-                finish_reason = finish_status.get_finish_reason()
+            group_request_id = convert_sub_id_to_group_id(sub_req_id)
+            index = sub_req_id
+            delta = request_output
+            finish_reason = finish_status.get_finish_reason()
+
+            # Handle reasoning content
+            if get_env_start_args().reasoning_parser and request.separate_reasoning:
+                reasoning_text, delta = _process_reasoning_stream(
+                    index, delta, reasoning_parser_dict, request_output, request
+                )
+                if reasoning_text:
+                    choice_data = ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(reasoning_content=reasoning_text),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=group_request_id,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"

+            if request.tool_choice != "none" and request.tools:
                 if index not in parser_dict:
                     # Provide a default value for tool_call_parser
                     tool_parser = getattr(g_objs.args, "tool_call_parser", None) or "llama3"
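In the streaming path above, each reasoning delta is flushed as its own SSE event, carrying `reasoning_content` in the delta instead of `content`. Illustrative wire frames as a client would receive them; the `id`, `created`, and `model` values are invented for this example:

```python
# Two consecutive SSE events: first the reasoning delta emitted by the
# branch above, then a regular content delta for the same stream.
reasoning_event = (
    'data: {"id": "chatcmpl-42", "created": 1700000000, "model": "qwen3", '
    '"choices": [{"index": 0, "delta": {"reasoning_content": "Let me check..."}, '
    '"finish_reason": null}]}'
)
content_event = (
    'data: {"id": "chatcmpl-42", "created": 1700000000, "model": "qwen3", '
    '"choices": [{"index": 0, "delta": {"role": "assistant", "content": "4"}, '
    '"finish_reason": null}]}'
)
print(reasoning_event, end="\n\n")  # each SSE event ends with a blank line
print(content_event, end="\n\n")
```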
@@ -368,7 +447,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             else:
                 group_request_id = convert_sub_id_to_group_id(sub_req_id)

-                delta_message = DeltaMessage(role="assistant", content=request_output)
+                delta_message = DeltaMessage(role="assistant", content=delta)
                 if finish_status.is_finished():
                     finish_reason = finish_status.get_finish_reason()
                 stream_choice = ChatCompletionStreamResponseChoice(
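End to end, a hypothetical client call exercising the new fields; the host, port, and `/v1/chat/completions` path are assumptions, while `separate_reasoning` and `chat_template_kwargs` are the request fields referenced in this diff:

```python
# Hypothetical client request: ask for reasoning to be split out of the
# final answer. Server address and endpoint path are assumed values.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "qwen3",
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "separate_reasoning": True,
        "chat_template_kwargs": {"enable_thinking": True},
    },
)
message = resp.json()["choices"][0]["message"]
print(message.get("reasoning_content"))  # chain-of-thought text
print(message["content"])                # final answer
```

With `stream: true`, the same request would instead interleave `reasoning_content` and `content` deltas as shown in the streaming hunk above.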