
Commit e49676c ("fix")
1 parent: dae8419
File tree: 6 files changed (+67, -40 lines)


fastdeploy/engine/request.py

Lines changed: 2 additions & 1 deletion

@@ -71,7 +71,8 @@ def __init__(
         guided_grammar: Optional[Any] = None,
         structural_tag: Optional[Any] = None,
         guided_json_object: Optional[bool] = None,
-        enable_thinking: Optional[bool] = True,
+        enable_thinking: Optional[bool] = False,
+        model_status: Optional[str] = None,
         trace_carrier: dict = dict(),
         dp_rank: Optional[int] = None,
         chat_template: Optional[str] = None,
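
A minimal sketch of what the two touched parameters mean after this hunk (a hypothetical stand-in, not the real fastdeploy.engine.request.Request, which takes many more arguments): thinking is now opt-in by default, and the request can carry a parser-detected model_status.

# Hypothetical, simplified stand-in for the two fields this hunk touches.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RequestSketch:
    enable_thinking: Optional[bool] = False  # default flipped from True to False
    model_status: Optional[str] = None       # new field introduced by this commit


req = RequestSketch()
assert req.enable_thinking is False and req.model_status is None

# An input processor would flip the flag only when the prompt is still inside
# an open thinking span (status "think_start").
req.model_status = "think_start"
req.enable_thinking = req.model_status == "think_start"
assert req.enable_thinking is True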

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 7 deletions

@@ -120,7 +120,6 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             text_after_process = current_req_dict.get("text_after_process")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
-            model_status = current_req_dict.get("model_status")
         except ParameterError as e:
             api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
             self.engine_client.semaphore.release()
@@ -136,12 +135,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):

         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process, model_status
+                request, request_id, request.model, prompt_token_ids, text_after_process
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process, model_status
+                    request, request_id, request.model, prompt_token_ids, text_after_process
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -169,7 +168,6 @@ async def chat_completion_stream_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
-        model_status: str,
     ):
         """
         Streaming chat completion generator.
@@ -240,7 +238,6 @@ async def chat_completion_stream_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=True,
-                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )

@@ -410,7 +407,6 @@ async def chat_completion_full_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
-        model_status: str,
     ):
         """
         Full chat completion generator.
@@ -460,7 +456,6 @@ async def chat_completion_full_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=False,
-                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
            )
        async for data in generator:
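
A hedged sketch of the resulting call-shape change (simplified names, not the real signatures in serving_chat.py): the serving layer no longer threads model_status into the generators or into process_response_chat; the input processor resolves it from the request id instead (see ernie4_5_processor.py below).

# Illustration only: "before" and "after" shapes of the response-processing call.
def process_response_chat_before(response, *, stream, model_status, include_stop_str_in_output):
    """Pre-commit shape: the caller had to supply model_status explicitly."""


def process_response_chat_after(response, *, stream, include_stop_str_in_output):
    """Post-commit shape: the processor looks the status up by request id."""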

fastdeploy/input/ernie4_5_processor.py

Lines changed: 17 additions & 6 deletions

@@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
         self.decode_status = dict()
         self.tool_parser_dict = dict()
         self.thinking_parser_dict = dict()
+        self.model_status_dict = dict()
         self._load_tokenizer()
         data_processor_logger.info(
             f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
@@ -154,6 +155,12 @@ def process_request(self, request, max_model_len=None, **kwargs):
             request.set("top_p", _SAMPLING_EPS)
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request.enable_thinking = True
+        if self.reasoning_parser:
+            self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status(
+                request.prompt_token_ids
+            )
+            if self.model_status_dict[request.request_id] == "think_start":
+                request.enable_thinking = True

         data_processor_logger.info(f"Processed request: {request}")
         return request
@@ -233,8 +240,8 @@ def process_request_dict(self, request, max_model_len=None):
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request["enable_thinking"] = True
         if self.reasoning_parser:
-            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            if request["model_status"] == "think_start":
+            self.model_status_dict["request_id"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+            if self.model_status_dict["request_id"] == "think_start":
                 request["enable_thinking"] = True
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
@@ -274,6 +281,8 @@ def process_response(self, response_dict, **kwargs):
         data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
         if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
             return None
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         return response_dict

     def process_response_dict(self, response_dict, stream, **kwargs):
@@ -302,7 +311,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
-        model_status = kwargs.get("model_status")
         token_ids = response_dict["outputs"]["token_ids"]
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
@@ -317,7 +325,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
         ):
             reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                full_text, response_dict, model_status
+                full_text, response_dict, self.model_status_dict.get(req_id)
             )
             response_dict["outputs"]["text"] = text
             response_dict["outputs"]["reasoning_content"] = reasoning_content
@@ -330,6 +338,8 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             response_dict["outputs"]["raw_prediction"] = full_text
             data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
             del self.decode_status[req_id]
+            if req_id in self.model_status_dict:
+                del self.model_status_dict[req_id]
         return response_dict

     def process_response_dict_streaming(self, response_dict, **kwargs):
@@ -343,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
-        model_status = kwargs.get("model_status")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -363,7 +372,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
-                model_status,
+                self.model_status_dict.get(req_id),
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
             if self.tool_parser_obj:
@@ -387,6 +396,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             del self.decode_status[req_id]
         if req_id in self.tool_parser_dict:
             del self.tool_parser_dict[req_id]
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         return response_dict

     def messages2ids(self, request_or_messages):
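
A minimal sketch, with hypothetical names (not the real processor class in ernie4_5_processor.py), of the bookkeeping pattern this file now follows: the prompt-derived model status is cached per request id when the request is processed, consulted while decoding, and removed once the request finishes.

# Hypothetical sketch of the per-request status lifecycle: store on
# process_request, read during response processing, delete on completion.
class ProcessorSketch:
    def __init__(self, reasoning_parser):
        self.reasoning_parser = reasoning_parser
        self.model_status_dict = {}

    def process_request(self, request_id, prompt_token_ids):
        # Cache the prompt-derived status, e.g. "think_start" or "think_end".
        status = self.reasoning_parser.get_model_status(prompt_token_ids)
        self.model_status_dict[request_id] = status
        return status == "think_start"  # whether thinking should be enabled

    def process_response(self, request_id, finished):
        # Look the status up by request id rather than receiving it as a kwarg.
        status = self.model_status_dict.get(request_id)
        if finished:
            self.model_status_dict.pop(request_id, None)
        return status


class AlwaysThinking:
    def get_model_status(self, prompt_token_ids):
        return "think_start"


proc = ProcessorSketch(AlwaysThinking())
assert proc.process_request("req-1", [1, 2, 3]) is True
assert proc.process_response("req-1", finished=True) == "think_start"
assert "req-1" not in proc.model_status_dict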

fastdeploy/input/text_processor.py

Lines changed: 4 additions & 0 deletions

@@ -265,6 +265,10 @@ def process_request(self, request, max_model_len=None, **kwargs):
             request.set("temperature", 1)
         if request.get("top_p") < _SAMPLING_EPS:
             request.set("top_p", _SAMPLING_EPS)
+        if self.reasoning_parser:
+            request.model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
+            if request.model_status == "think_start":
+                request.enable_thinking = True

         data_processor_logger.info(f"Processed request: {request}")
         return request

fastdeploy/reasoning/ernie_vl_reasoning_parsers.py

Lines changed: 37 additions & 25 deletions

@@ -35,38 +35,47 @@ class ErnieVLReasoningParser(ReasoningParser):

     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_start_token = "</think>"
-        self.think_end_token = "</think>"
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+        }

         if not self.model_tokenizer:
-            raise ValueError(
-                "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
+            raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
+
+        missing_tokens = []
+        for name, token_value in token_definitions.items():
+            setattr(self, name, token_value)
+            token_id = self.vocab.get(token_value)
+            setattr(self, f"{name}_id", token_id)
+            if token_id is None:
+                missing_tokens.append(f"{name.replace('_', ' ')} token")
+
+        if missing_tokens:
+            raise RuntimeError(
+                f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}"
             )
-
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if self.think_end_token_id is None:
-            raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!")
-        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.token_status_mapping = {
+            self.think_start_token_id: "think_start",
+            self.think_end_token_id: "think_end",
+        }

     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids

     def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
         for i in range(len(prompt_token_ids) - 1, -1, -1):
-            if prompt_token_ids[i] in [self.think_end_token_id, self.think_start_token_id]:
+            if prompt_token_ids[i] in self.token_status_mapping:
                 return prompt_token_ids[i]
         return -1

     def get_model_status(self, prompt_token_ids: list[int]):
         special_token_id = self.find_last_special_token(prompt_token_ids)
+
         if special_token_id == -1:
-            return "responding"
-        if special_token_id == self.think_end_token_id:
-            return "responding"
-        if self.think_start_token_id == special_token_id:
-            return "thinking"
+            return "think_start"

-        return "responding"
+        return self.token_status_mapping[special_token_id]

     def extract_reasoning_content_streaming(
         self,
@@ -89,15 +98,18 @@ def extract_reasoning_content_streaming(
         # Skip single special tokens
         if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
             return None
-        if self.think_end_token_id in delta_token_ids:
-            end_index = delta_text.find(self.end_token)
-            reasoning_content = delta_text[:end_index]
-            content = delta_text[end_index + len(self.end_token) :]
-            return DeltaMessage(reasoning_content=reasoning_content, content=content)
-        elif self.think_end_token_id in previous_token_ids:
-            return DeltaMessage(content=delta_text)
+        if model_status == "think_start":
+            if self.think_end_token_id in delta_token_ids:
+                end_index = delta_text.find(self.end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content)
+            elif self.think_end_token_id in previous_token_ids:
+                return DeltaMessage(content=delta_text)
+            else:
+                return DeltaMessage(reasoning_content=delta_text)
         else:
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(content=delta_text)

     def extract_reasoning_content(
         self,
@@ -117,7 +129,7 @@ def extract_reasoning_content(
         """

         # Check if the model output contains the </think> tokens.
-        if model_status == "thinking":
+        if model_status == "think_start":
             if self.think_end_token not in model_output:
                 return model_output, ""
             reasoning_content, _, content = model_output.partition(self.think_end_token)
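
To illustrate the status detection introduced above, a standalone sketch (made-up token ids, not the FastDeploy class): scan the prompt backwards for the last think token and map it to a status, defaulting to "think_start" when no think token appears.

# Standalone illustration of the get_model_status logic; the token ids are
# invented for the example and do not correspond to any real tokenizer.
THINK_START_ID, THINK_END_ID = 1001, 1002
TOKEN_STATUS = {THINK_START_ID: "think_start", THINK_END_ID: "think_end"}


def get_model_status(prompt_token_ids):
    # Walk the prompt from the end and return the status of the last
    # think-start / think-end token; default to "think_start" otherwise.
    for token_id in reversed(prompt_token_ids):
        if token_id in TOKEN_STATUS:
            return TOKEN_STATUS[token_id]
    return "think_start"


assert get_model_status([7, THINK_START_ID, 9]) == "think_start"
assert get_model_status([7, THINK_START_ID, 9, THINK_END_ID]) == "think_end"
assert get_model_status([7, 8, 9]) == "think_start"

The streaming extractor then branches on this status: under "think_start" it keeps routing tokens to reasoning_content until the think-end token appears, otherwise everything is emitted as ordinary content.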

fastdeploy/reasoning/qwen3_reasoning_parsers.py

Lines changed: 5 additions & 1 deletion

@@ -51,6 +51,9 @@ def __init__(self, tokenizer):
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids

+    def get_model_status(self, prompt_token_ids: list[int]):
+        return "think_start"
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
@@ -59,6 +62,7 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
         """
         Extract reasoning content from a delta message.
@@ -103,7 +107,7 @@ def extract_reasoning_content_streaming(
         return DeltaMessage(reasoning_content=delta_text)

     def extract_reasoning_content(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: ChatCompletionRequest, model_status: str
     ) -> tuple[Optional[str], Optional[str]]:
         """
         Extract reasoning content from the model output.
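
For comparison, a sketch of what the Qwen3 parser's new hook amounts to (hypothetical class name, not the real parser): it reports a constant "think_start" status, so the existing Qwen3 reasoning extraction is unchanged while the new model_status parameter can still be passed uniformly by callers.

# Hypothetical sketch: the Qwen3 parser's get_model_status ignores the prompt
# and always reports "think_start", keeping its behaviour independent of the
# new status plumbing.
class Qwen3StatusSketch:
    def get_model_status(self, prompt_token_ids):
        return "think_start"


assert Qwen3StatusSketch().get_model_status([1, 2, 3]) == "think_start"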
