42 changes: 0 additions & 42 deletions fastdeploy/model_executor/pre_and_post_process.py
@@ -194,48 +194,6 @@ def post_process_normal(
zmq_client: ZmqIpcClient = None,
) -> ModelRunnerOutput:
"""Post-processing steps after completing a single token generation."""
# handle vl:
if model_output.think_end_id != -1:
thinking_mask = model_output.enable_thinking
exists_think_end = (sampler_output.sampled_token_ids == model_output.think_end_id) & thinking_mask
paddle.assign(
paddle.where(
exists_think_end,
model_output.need_think_end - 1,
model_output.need_think_end,
),
model_output.need_think_end,
)

reasoning_index_update_cond = model_output.need_think_end.cast("bool") & thinking_mask
paddle.assign(
paddle.where(
reasoning_index_update_cond,
model_output.reasoning_index - 1,
model_output.reasoning_index,
),
model_output.reasoning_index,
)

stop_wo_think = (
(sampler_output.sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
| (model_output.reasoning_index == 0)
) & (model_output.need_think_end > 0)

stop_wo_think = stop_wo_think & thinking_mask
sampler_output.sampled_token_ids = paddle.where(
stop_wo_think,
model_output.think_end_id,
sampler_output.sampled_token_ids,
)
paddle.assign(
paddle.where(
stop_wo_think,
model_output.need_think_end - 1,
model_output.need_think_end,
),
model_output.need_think_end,
)
# 1. Set stop value
paddle.assign(
paddle.where(
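For context, the block deleted above implements the per-request "thinking" bookkeeping during post-processing: once a sequence emits the think-end token (or exhausts its reasoning budget or hits EOS), its counters are decremented and the think-end token is forced into the output. Below is a minimal standalone sketch of the first two updates, assuming hypothetical batch size, token ids, and budget values; only paddle is assumed as a dependency, and this is an illustration rather than the project's actual code path.

```python
# Minimal sketch of the think-end bookkeeping from the deleted block above.
# Batch size, token ids, and the reasoning budget below are hypothetical.
import paddle

sampled_token_ids = paddle.to_tensor([[7], [42], [7], [5]], dtype="int64")  # one sampled token per sequence
think_end_id = 7                                                            # hypothetical </think> token id
enable_thinking = paddle.to_tensor([[True], [True], [False], [True]])       # per-request thinking mask
need_think_end = paddle.full([4, 1], 1, dtype="int32")                      # 1 while a </think> is still owed
reasoning_index = paddle.full([4, 1], 16, dtype="int32")                    # remaining reasoning-token budget

# A sequence stops owing a </think> once it samples think_end_id while thinking is enabled.
exists_think_end = (sampled_token_ids == think_end_id) & enable_thinking
paddle.assign(paddle.where(exists_think_end, need_think_end - 1, need_think_end), need_think_end)

# Sequences that are still thinking spend one token of their reasoning budget.
still_thinking = need_think_end.cast("bool") & enable_thinking
paddle.assign(paddle.where(still_thinking, reasoning_index - 1, reasoning_index), reasoning_index)
```

The remaining branch in the deleted block then overwrites the sampled token with think_end_id for any sequence whose budget reaches zero (or that hits EOS) before it has emitted the think-end token on its own.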
8 changes: 0 additions & 8 deletions fastdeploy/worker/gcu_model_runner.py
@@ -837,10 +837,6 @@ def _dummy_run(
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
)

post_process(
@@ -1065,10 +1061,6 @@ class at the server level, which is too granular for ModelRunner.
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
)

if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill":
49 changes: 0 additions & 49 deletions fastdeploy/worker/gpu_model_runner.py
@@ -258,13 +258,9 @@ def _init_logits_processor(self, request):
elif request.structural_tag is not None:
schemata_key = ("structural_tag", request.structural_tag)

enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True

return (
self.guided_backend.get_logits_processor(
schemata_key=schemata_key,
enable_thinking=enable_thinking,
),
schemata_key,
)
@@ -326,23 +322,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
position_ids, request.get("max_tokens", 2048)
)

if request.get("enable_thinking", False):
# Enable thinking
req_reasoning_max_tokens = request.get("reasoning_max_tokens")
req_max_tokens = request.get("max_tokens")
final_reasoning_tokens = (
req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
)

self.share_inputs["enable_thinking"][idx : idx + 1] = True
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
else:
# Disable thinking
self.share_inputs["enable_thinking"][idx : idx + 1] = False
self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0

if isinstance(request.prompt_token_ids, np.ndarray):
prompt_token_ids = request.prompt_token_ids.tolist()
else:
@@ -566,23 +545,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
)
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0

if request.get("enable_thinking", False):
# Enable thinking
req_reasoning_max_tokens = request.get("reasoning_max_tokens")
req_max_tokens = request.get("max_tokens")
final_reasoning_tokens = (
req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
)

self.share_inputs["enable_thinking"][idx : idx + 1] = True
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
else:
# Disable thinking
self.share_inputs["enable_thinking"][idx : idx + 1] = False
self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0

def get_attr_from_request(request, attr, default_value=None):
res = request.get(attr, default_value)
if res is not None:
@@ -878,9 +840,6 @@ def _init_share_inputs(self, max_num_seqs: int):
tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))

# Initialize thinking related buffers
self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool")
self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

# TODO(gongshaotian): move to models
if not self.enable_mm:
@@ -1423,10 +1382,6 @@ def _dummy_run(
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=self.share_inputs["enable_thinking"],
think_end_id=self.model_config.think_end_id,
need_think_end=self.share_inputs["need_think_end"],
reasoning_index=self.share_inputs["reasoning_index"],
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
)
@@ -1739,10 +1694,6 @@ class at the server level, which is too granular for ModelRunner.
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=self.share_inputs["enable_thinking"],
think_end_id=self.model_config.think_end_id,
need_think_end=self.share_inputs["need_think_end"][:num_running_requests],
reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests],
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
)
21 changes: 0 additions & 21 deletions fastdeploy/worker/metax_model_runner.py
@@ -242,11 +242,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
else:
position_ids = None

enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
@@ -459,11 +454,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
self.share_inputs["prompt_lens"][idx : idx + 1] = length

if self.enable_mm:
enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
@@ -779,9 +769,6 @@ def _init_share_inputs(self, max_num_seqs: int):
dtype="float32",
)
self.share_inputs["image_features"] = None
self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

def _prepare_inputs(self) -> None:
"""Prepare the model inputs"""
@@ -1133,10 +1120,6 @@ def _dummy_run(
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
)
@@ -1401,10 +1384,6 @@ class at the server level, which is too granular for ModelRunner.
),
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
)
20 changes: 0 additions & 20 deletions fastdeploy/worker/output.py
@@ -220,26 +220,6 @@ class ModelOutputData:
"""
accept_num: paddle.Tensor

"""
vl model enable to think
"""
enable_thinking: paddle.Tensor = None

"""
vl model think end id
"""
think_end_id: int = -1

"""
vl model need to think
"""
need_think_end: paddle.Tensor = None

"""
vl model reasoning index
"""
reasoning_index: paddle.Tensor = None

"""
the token ids of stop sequence
"""
56 changes: 0 additions & 56 deletions fastdeploy/worker/xpu_model_runner.py
@@ -203,45 +203,6 @@ def xpu_post_process(
update_inputs,
)

# handle vl:
if model_output.enable_thinking:
exists_think_end = sampled_token_ids == model_output.think_end_id
paddle.assign(
paddle.where(
exists_think_end,
model_output.need_think_end - 1,
model_output.need_think_end,
),
model_output.need_think_end,
)

paddle.assign(
paddle.where(
model_output.need_think_end.cast("bool"),
model_output.reasoning_index - 1,
model_output.reasoning_index,
),
model_output.reasoning_index,
)

stop_wo_think = (
(sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
| (model_output.reasoning_index == 0)
) & (model_output.need_think_end > 0)
sampled_token_ids = paddle.where(
stop_wo_think,
model_output.think_end_id,
sampled_token_ids,
)
paddle.assign(
paddle.where(
stop_wo_think,
model_output.need_think_end - 1,
model_output.need_think_end,
),
model_output.need_think_end,
)

# 1. Set stop value
paddle.assign(
paddle.where(
@@ -499,11 +460,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
else:
position_ids = None

enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
@@ -638,11 +594,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
self.share_inputs["prompt_lens"][idx : idx + 1] = length

if self.enable_mm:
enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
@@ -857,9 +808,6 @@ def _init_share_inputs(self, max_num_seqs: int):
dtype="float32",
)
self.share_inputs["image_features"] = None
self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

def _prepare_inputs(self, is_dummy_run=False) -> None:
"""Prepare the model inputs"""
@@ -1159,10 +1107,6 @@ class at the server level, which is too granular for ModelRunner.
actual_draft_token_num=None,
accept_tokens=None,
accept_num=None,
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
)