
Commit 2c92f6f

fix
1 parent e49676c commit 2c92f6f

4 files changed: 44 additions, 60 deletions


fastdeploy/entrypoints/openai/response_processors.py

Lines changed: 1 addition & 5 deletions

```diff
@@ -67,13 +67,12 @@ def accumulate_token_ids(self, request_output):
         else:
             self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
 
-    async def process_response_chat(self, request_outputs, stream, model_status, include_stop_str_in_output):
+    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output):
         """
         Process a list of responses into a generator that yields each processed response as it's generated.
         Args:
             request_outputs: The list of outputs to be processed.
             stream: Whether or not to stream the output.
-            model_status: Whether or not to show thinking messages.
             include_stop_str_in_output: Whether or not to include stop strings in the output.
         """
         for request_output in request_outputs:
@@ -82,7 +81,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc
                 yield self.data_processor.process_response_dict(
                     response_dict=request_output,
                     stream=stream,
-                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
             elif stream:
@@ -108,7 +106,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc
                 self.data_processor.process_response_dict(
                     response_dict=request_output,
                     stream=stream,
-                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
                 text = {"type": "text", "text": request_output["outputs"]["text"]}
@@ -128,7 +125,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc
                 self.data_processor.process_response_dict(
                     response_dict=part["request_output"],
                     stream=False,
-                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
                 text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
```

fastdeploy/input/ernie4_5_processor.py

Lines changed: 9 additions & 14 deletions

```diff
@@ -240,8 +240,10 @@ def process_request_dict(self, request, max_model_len=None):
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request["enable_thinking"] = True
         if self.reasoning_parser:
-            self.model_status_dict["request_id"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            if self.model_status_dict["request_id"] == "think_start":
+            self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status(
+                request["prompt_token_ids"]
+            )
+            if self.model_status_dict[request["request_id"]] == "think_start":
                 request["enable_thinking"] = True
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
@@ -256,7 +258,6 @@ def process_response(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        model_status = kwargs.get("model_status")
         req_id = response_dict.request_id
         token_ids = response_dict.outputs.token_ids
 
@@ -266,7 +267,7 @@ def process_response(self, response_dict, **kwargs):
         full_text = self.tokenizer.decode(token_ids)
         if self.reasoning_parser:
             reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                full_text, response_dict, model_status
+                full_text, response_dict, self.model_status_dict[req_id]
             )
             response_dict.outputs.text = text
             response_dict.outputs.reasoning_content = reasoning_content
@@ -310,7 +311,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.get("enable_thinking")
         token_ids = response_dict["outputs"]["token_ids"]
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
@@ -321,11 +321,9 @@
         if is_end:
             full_text = previous_texts + delta_text
             response_dict["outputs"]["text"] = full_text
-            if self.reasoning_parser and (
-                enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
-            ):
+            if self.reasoning_parser:
                 reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict, self.model_status_dict.get(req_id)
+                    full_text, response_dict, self.model_status_dict[req_id]
                 )
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
@@ -352,7 +350,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.get("enable_thinking")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -362,17 +359,15 @@
             token_ids = token_ids[:-1]
         delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
         response_dict["outputs"]["raw_prediction"] = delta_text
-        if self.reasoning_parser and (
-            enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
-        ):
+        if self.reasoning_parser:
             reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
                 previous_texts,
                 previous_texts + delta_text,
                 delta_text,
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
-                self.model_status_dict.get(req_id),
+                self.model_status_dict[req_id],
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
         if self.tool_parser_obj:
```
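
Two fixes land here. First, the request-time bookkeeping previously used the literal string "request_id" as the dictionary key, so concurrent requests all wrote to one shared slot; it is now keyed by the actual request id. Second, the response paths switch from self.model_status_dict.get(req_id) to direct indexing, since the entry is guaranteed to exist once process_request_dict has run. A self-contained illustration of the key bug, using hypothetical request payloads:

```python
# Two in-flight requests, as hypothetical dicts.
request_a = {"request_id": "req-a", "prompt_token_ids": [1, 2, 3]}
request_b = {"request_id": "req-b", "prompt_token_ids": [4, 5, 6]}

buggy = {}
for req in (request_a, request_b):
    buggy["request_id"] = "think_start"  # literal key: the second request clobbers the first

fixed = {}
for req in (request_a, request_b):
    fixed[req["request_id"]] = "think_start"  # keyed by the actual id, one slot per request

assert len(buggy) == 1 and len(fixed) == 2
```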

fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py

Lines changed: 7 additions & 20 deletions

```diff
@@ -54,6 +54,7 @@ def __init__(
 
         self.tool_parser_dict = dict()
         self.decode_status = dict()
+        self.model_status_dict = dict()
         self._load_tokenizer()
 
         # Generation config
@@ -255,8 +256,12 @@ def process_request_dict(self, request, max_model_len=None):
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
         data_processor_logger.info(f"Processed request {request}")
 
-        if self.reasoning_parser is not None:
-            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+        if self.reasoning_parser:
+            self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status(
+                request.prompt_token_ids
+            )
+            if self.model_status_dict[request.request_id] == "think_start":
+                request.enable_thinking = True
 
         return request
 
@@ -290,21 +295,3 @@ def pack_outputs(self, outs):
         outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
 
         return outs
-
-    def process_response_dict(self, response_dict, stream, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_dict (Dict): response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        enable_thinking = kwargs.pop("enable_thinking", True)
-        if enable_thinking is None:
-            enable_thinking = True
-        if stream:
-            return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
-        else:
-            return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs)
```
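
The deleted override existed only to normalize the enable_thinking kwarg before dispatching to the streaming or non-streaming path. With that kwarg gone, the inherited dispatcher suffices; it has the same shape as the one in text_processor.py below. A simplified sketch of the result, assuming a flattened class hierarchy and stubbed-out handlers:

```python
class BaseProcessor:
    def process_response_dict(self, response_dict, **kwargs):
        # Dispatch on stream only; no enable_thinking normalization remains.
        if kwargs.pop("stream", True):
            return self.process_response_dict_streaming(response_dict, **kwargs)
        return self.process_response_dict_normal(response_dict, **kwargs)

    def process_response_dict_streaming(self, response_dict, **kwargs):
        return response_dict  # stub for the sketch

    def process_response_dict_normal(self, response_dict, **kwargs):
        return response_dict  # stub for the sketch


class Ernie45VLProcessor(BaseProcessor):
    # After this commit the subclass no longer overrides process_response_dict;
    # the base class dispatch is inherited unchanged.
    pass
```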

fastdeploy/input/text_processor.py

Lines changed: 27 additions & 21 deletions

```diff
@@ -175,6 +175,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
         self.generation_config = None
 
         self.decode_status = dict()
+        self.model_status_dict = dict()
         self.tool_parser_dict = dict()
         self.tokenizer = self._load_tokenizer()
         data_processor_logger.info(
@@ -266,8 +267,10 @@ def process_request(self, request, max_model_len=None, **kwargs):
         if request.get("top_p") < _SAMPLING_EPS:
             request.set("top_p", _SAMPLING_EPS)
         if self.reasoning_parser:
-            request.model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            if request.model_status == "think_start":
+            self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status(
+                request.prompt_token_ids
+            )
+            if self.model_status_dict[request.request_id] == "think_start":
                 request.enable_thinking = True
 
         data_processor_logger.info(f"Processed request: {request}")
@@ -343,6 +346,12 @@ def process_request_dict(self, request, max_model_len=None, **kwargs):
             request["temperature"] = 1
         if request.get("top_p") < _SAMPLING_EPS:
             request["top_p"] = _SAMPLING_EPS
+        if self.reasoning_parser:
+            self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status(
+                request["prompt_token_ids"]
+            )
+            if self.model_status_dict[request["request_id"]] == "think_start":
+                request["enable_thinking"] = True
 
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
@@ -366,21 +375,22 @@ def process_response(self, response_dict, **kwargs):
         if token_ids[-1] == self.tokenizer.eos_token_id:
             token_ids = token_ids[:-1]
         full_text = self.tokenizer.decode(token_ids)
-
+        response_dict.outputs.text = full_text
         # The model supports thinking, and thinking is enabled
         if self.reasoning_parser:
-            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
+            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
+                full_text, response_dict, self.model_status_dict[req_id]
+            )
             response_dict.outputs.text = text
             response_dict.outputs.reasoning_content = reasoning_content
-        else:
-            # The model does not support thinking, and enable_thinking was not separately set to False
-            response_dict.outputs.text = full_text
         if self.tool_parser_obj:
             tool_parser = self.tool_parser_obj(self.tokenizer)
             tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
             if tool_call_info.tools_called:
                 response_dict.outputs.tool_calls = tool_call_info.tool_calls
                 response_dict.outputs.text = tool_call_info.content
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
 
         return response_dict
@@ -395,7 +405,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.get("enable_thinking")
         token_ids = response_dict["outputs"]["token_ids"]
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
@@ -406,12 +415,13 @@
         if is_end:
             full_text = previous_texts + delta_text
             response_dict["outputs"]["raw_prediction"] = full_text
-            if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
+            response_dict["outputs"]["text"] = full_text
+            if self.reasoning_parser:
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
+                    full_text, response_dict, self.model_status_dict[req_id]
+                )
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
-            else:
-                response_dict["outputs"]["text"] = full_text
             if self.tool_parser_obj:
                 tool_parser = self.tool_parser_obj(self.tokenizer)
                 tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
@@ -432,7 +442,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.get("enable_thinking")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -442,16 +451,15 @@
             token_ids = token_ids[:-1]
         delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
         response_dict["outputs"]["raw_prediction"] = delta_text
-        if self.reasoning_parser and (
-            enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
-        ):
+        if self.reasoning_parser:
             reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
                 previous_texts,
                 previous_texts + delta_text,
                 delta_text,
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
+                self.model_status_dict[req_id],
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
         if self.tool_parser_obj:
@@ -475,6 +483,8 @@
             del self.decode_status[req_id]
         if req_id in self.tool_parser_dict:
             del self.tool_parser_dict[req_id]
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         return response_dict
 
     def process_response_dict(self, response_dict, **kwargs):
@@ -487,16 +497,12 @@ def process_response_dict(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.pop("enable_thinking", True)
-        if enable_thinking is None:
-            enable_thinking = True
         stream = kwargs.get("stream", True)
         if stream:
-            return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
+            return self.process_response_dict_streaming(response_dict, **kwargs)
         else:
             return self.process_response_dict_normal(
                 response_dict=response_dict,
-                enable_thinking=enable_thinking,
                 **kwargs,
             )
```
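
Taken together, the four files give the model status a complete per-request lifecycle: recorded during request preprocessing, consumed during response processing, and deleted when the request finishes, so the dict cannot grow without bound. A condensed, runnable sketch of that lifecycle; the class shape, the stub parser, and every status value other than "think_start" are assumptions for illustration:

```python
class StubReasoningParser:
    """Hypothetical stand-in; only get_model_status is sketched."""

    THINK_START_ID = 1000  # assumed sentinel token id, not FastDeploy's real value

    def get_model_status(self, prompt_token_ids):
        return "think_start" if prompt_token_ids[-1] == self.THINK_START_ID else "think_end"


class TextProcessor:
    def __init__(self, reasoning_parser=None):
        self.reasoning_parser = reasoning_parser
        self.model_status_dict = {}  # request_id -> status string

    def process_request_dict(self, request):
        # 1. Record the status once, keyed by the real request id.
        if self.reasoning_parser:
            self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status(
                request["prompt_token_ids"]
            )
            if self.model_status_dict[request["request_id"]] == "think_start":
                request["enable_thinking"] = True
        return request

    def process_response_dict(self, response_dict):
        req_id = response_dict["request_id"]
        # 2. Consume the status while decoding (the commit indexes directly, since
        #    the entry is always written at request time; .get keeps this sketch
        #    runnable when no parser is configured).
        response_dict["model_status"] = self.model_status_dict.get(req_id)
        # 3. Drop the entry on the final chunk.
        if response_dict["finished"] and req_id in self.model_status_dict:
            del self.model_status_dict[req_id]
        return response_dict


proc = TextProcessor(StubReasoningParser())
proc.process_request_dict({"request_id": "req-1", "prompt_token_ids": [5, 1000]})
proc.process_response_dict({"request_id": "req-1", "finished": True})
assert "req-1" not in proc.model_status_dict  # cleaned up after the final chunk
```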
