From 722b93398520b7908c94bd069a1f56aba25c7d4b Mon Sep 17 00:00:00 2001
From: yuanlehome
Date: Thu, 25 Sep 2025 22:54:37 +0800
Subject: [PATCH 1/2] part1 delete impl

---
 .../model_executor/pre_and_post_process.py | 42 ----------------
 fastdeploy/worker/gpu_model_runner.py      | 49 -------------------
 2 files changed, 91 deletions(-)

diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index f634537840..34685b5201 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -194,48 +194,6 @@ def post_process_normal(
     zmq_client: ZmqIpcClient = None,
 ) -> ModelRunnerOutput:
     """Post-processing steps after completing a single token generation."""
-    # handle vl:
-    if model_output.think_end_id != -1:
-        thinking_mask = model_output.enable_thinking
-        exists_think_end = (sampler_output.sampled_token_ids == model_output.think_end_id) & thinking_mask
-        paddle.assign(
-            paddle.where(
-                exists_think_end,
-                model_output.need_think_end - 1,
-                model_output.need_think_end,
-            ),
-            model_output.need_think_end,
-        )
-
-        reasoning_index_update_cond = model_output.need_think_end.cast("bool") & thinking_mask
-        paddle.assign(
-            paddle.where(
-                reasoning_index_update_cond,
-                model_output.reasoning_index - 1,
-                model_output.reasoning_index,
-            ),
-            model_output.reasoning_index,
-        )
-
-        stop_wo_think = (
-            (sampler_output.sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
-            | (model_output.reasoning_index == 0)
-        ) & (model_output.need_think_end > 0)
-
-        stop_wo_think = stop_wo_think & thinking_mask
-        sampler_output.sampled_token_ids = paddle.where(
-            stop_wo_think,
-            model_output.think_end_id,
-            sampler_output.sampled_token_ids,
-        )
-        paddle.assign(
-            paddle.where(
-                stop_wo_think,
-                model_output.need_think_end - 1,
-                model_output.need_think_end,
-            ),
-            model_output.need_think_end,
-        )
     # 1. Set stop value
     paddle.assign(
         paddle.where(
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 92eac7ddab..ff5dd45c7c 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -258,13 +258,9 @@ def _init_logits_processor(self, request):
         elif request.structural_tag is not None:
             schemata_key = ("structural_tag", request.structural_tag)
 
-        enable_thinking = request.get("enable_thinking", True)
-        enable_thinking = enable_thinking if enable_thinking is not None else True
-
         return (
             self.guided_backend.get_logits_processor(
                 schemata_key=schemata_key,
-                enable_thinking=enable_thinking,
             ),
             schemata_key,
         )
@@ -326,23 +322,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                     position_ids, request.get("max_tokens", 2048)
                 )
 
-                if request.get("enable_thinking", False):
-                    # Enable thinking
-                    req_reasoning_max_tokens = request.get("reasoning_max_tokens")
-                    req_max_tokens = request.get("max_tokens")
-                    final_reasoning_tokens = (
-                        req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
-                    )
-
-                    self.share_inputs["enable_thinking"][idx : idx + 1] = True
-                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
-                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
-                else:
-                    # Disable thinking
-                    self.share_inputs["enable_thinking"][idx : idx + 1] = False
-                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
-                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
-
             if isinstance(request.prompt_token_ids, np.ndarray):
                 prompt_token_ids = request.prompt_token_ids.tolist()
             else:
@@ -566,23 +545,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
                 )
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
 
-                if request.get("enable_thinking", False):
-                    # Enable thinking
-                    req_reasoning_max_tokens = request.get("reasoning_max_tokens")
-                    req_max_tokens = request.get("max_tokens")
-                    final_reasoning_tokens = (
-                        req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
-                    )
-
-                    self.share_inputs["enable_thinking"][idx : idx + 1] = True
-                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
-                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
-                else:
-                    # Disable thinking
-                    self.share_inputs["enable_thinking"][idx : idx + 1] = False
-                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
-                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
-
            def get_attr_from_request(request, attr, default_value=None):
                res = request.get(attr, default_value)
                if res is not None:
@@ -878,9 +840,6 @@ def _init_share_inputs(self, max_num_seqs: int):
         tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
 
         # Initialize thinking related buffers
-        self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
-        self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool")
-        self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
 
         # TODO(gongshaotian): move to models
         if not self.enable_mm:
@@ -1423,10 +1382,6 @@ def _dummy_run(
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=self.share_inputs["enable_thinking"],
-                think_end_id=self.model_config.think_end_id,
-                need_think_end=self.share_inputs["need_think_end"],
-                reasoning_index=self.share_inputs["reasoning_index"],
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )
@@ -1739,10 +1694,6 @@ class at the server level, which is too granular for ModelRunner.
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=self.share_inputs["enable_thinking"],
-                think_end_id=self.model_config.think_end_id,
-                need_think_end=self.share_inputs["need_think_end"][:num_running_requests],
-                reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests],
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )

From 3bad98a2005c4a3101eba27b81d15ea38d7b84b8 Mon Sep 17 00:00:00 2001
From: yuanlehome
Date: Thu, 25 Sep 2025 23:01:15 +0800
Subject: [PATCH 2/2] part2 delete impl

---
 fastdeploy/worker/gcu_model_runner.py   |  8 ----
 fastdeploy/worker/metax_model_runner.py | 21 ----------
 fastdeploy/worker/output.py             | 20 ---------
 fastdeploy/worker/xpu_model_runner.py   | 56 -------------------------
 4 files changed, 105 deletions(-)

diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 332659118f..30172aa6a0 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -837,10 +837,6 @@ def _dummy_run(
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
             )
 
             post_process(
@@ -1065,10 +1061,6 @@ class at the server level, which is too granular for ModelRunner.
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
             )
 
         if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill":
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
index 93368b2a47..b7cdca4573 100644
--- a/fastdeploy/worker/metax_model_runner.py
+++ b/fastdeploy/worker/metax_model_runner.py
@@ -242,11 +242,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                 else:
                     position_ids = None
 
-                enable_thinking = request.get("enable_thinking", True)
-                enable_thinking = enable_thinking if enable_thinking is not None else True
-                self.share_inputs["enable_thinking"][:] = enable_thinking
-                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
                 self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                     position_ids, request.get("max_tokens", 2048)
                 )
@@ -459,11 +454,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
             self.share_inputs["prompt_lens"][idx : idx + 1] = length
 
             if self.enable_mm:
-                enable_thinking = request.get("enable_thinking", True)
-                enable_thinking = enable_thinking if enable_thinking is not None else True
-                self.share_inputs["enable_thinking"][:] = enable_thinking
-                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
                 self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                     position_ids, request.get("max_tokens", 2048)
                 )
@@ -779,9 +769,6 @@ def _init_share_inputs(self, max_num_seqs: int):
                 dtype="float32",
             )
             self.share_inputs["image_features"] = None
-            self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
-            self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
-            self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
 
     def _prepare_inputs(self) -> None:
         """Prepare the model inputs"""
@@ -1133,10 +1120,6 @@ def _dummy_run(
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )
@@ -1401,10 +1384,6 @@ class at the server level, which is too granular for ModelRunner.
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )
diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py
index 6d820a873a..9b11062859 100644
--- a/fastdeploy/worker/output.py
+++ b/fastdeploy/worker/output.py
@@ -220,26 +220,6 @@ class ModelOutputData:
     """
     accept_num: paddle.Tensor
 
-    """
-    vl model enable to think
-    """
-    enable_thinking: paddle.Tensor = None
-
-    """
-    vl model think end id
-    """
-    think_end_id: int = -1
-
-    """
-    vl model need to think
-    """
-    need_think_end: paddle.Tensor = None
-
-    """
-    vl model reasoning index
-    """
-    reasoning_index: paddle.Tensor = None
-
     """
     the token ids of stop sequence
     """
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index b5fa856ce1..99ff708c0c 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -203,45 +203,6 @@ def xpu_post_process(
         update_inputs,
     )
 
-    # handle vl:
-    if model_output.enable_thinking:
-        exists_think_end = sampled_token_ids == model_output.think_end_id
-        paddle.assign(
-            paddle.where(
-                exists_think_end,
-                model_output.need_think_end - 1,
-                model_output.need_think_end,
-            ),
-            model_output.need_think_end,
-        )
-
-        paddle.assign(
-            paddle.where(
-                model_output.need_think_end.cast("bool"),
-                model_output.reasoning_index - 1,
-                model_output.reasoning_index,
-            ),
-            model_output.reasoning_index,
-        )
-
-        stop_wo_think = (
-            (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
-            | (model_output.reasoning_index == 0)
-        ) & (model_output.need_think_end > 0)
-
-        sampled_token_ids = paddle.where(
-            stop_wo_think,
-            model_output.think_end_id,
-            sampled_token_ids,
-        )
-        paddle.assign(
-            paddle.where(
-                stop_wo_think,
-                model_output.need_think_end - 1,
-                model_output.need_think_end,
-            ),
-            model_output.need_think_end,
-        )
     # 1. Set stop value
     paddle.assign(
         paddle.where(
@@ -499,11 +460,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 else:
                     position_ids = None
 
-                enable_thinking = request.get("enable_thinking", True)
-                enable_thinking = enable_thinking if enable_thinking is not None else True
-                self.share_inputs["enable_thinking"][:] = enable_thinking
-                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
                 self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                     position_ids, request.get("max_tokens", 2048)
                 )
@@ -638,11 +594,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
             self.share_inputs["prompt_lens"][idx : idx + 1] = length
 
             if self.enable_mm:
-                enable_thinking = request.get("enable_thinking", True)
-                enable_thinking = enable_thinking if enable_thinking is not None else True
-                self.share_inputs["enable_thinking"][:] = enable_thinking
-                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
                 self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                     position_ids, request.get("max_tokens", 2048)
                 )
@@ -857,9 +808,6 @@ def _init_share_inputs(self, max_num_seqs: int):
                 dtype="float32",
            )
            self.share_inputs["image_features"] = None
-            self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
-            self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
-            self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
 
     def _prepare_inputs(self, is_dummy_run=False) -> None:
         """Prepare the model inputs"""
@@ -1159,10 +1107,6 @@ class at the server level, which is too granular for ModelRunner.
             actual_draft_token_num=None,
             accept_tokens=None,
             accept_num=None,
-            enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-            think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
-            need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
-            reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
             stop_token_ids=self.share_inputs["stop_seqs"],
             stop_seqs_len=self.share_inputs["stop_seqs_len"],
         )
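
Reviewer note: both patches delete the same per-token "thinking" bookkeeping from the post-processing and input-insertion paths of the runners. For reference, the deleted behavior is summarized below in one condensed sketch. It reuses the names from the removed code (enable_thinking, need_think_end, reasoning_index, think_end_id) but is rewritten as a pure function instead of the original in-place paddle.assign updates on share_inputs buffers, and the helper name removed_thinking_step is introduced here only for illustration.

    import paddle


    def removed_thinking_step(sampled_token_ids, enable_thinking, need_think_end,
                              reasoning_index, think_end_id, eos_token_id):
        """Condensed sketch of the per-step bookkeeping deleted by this series."""
        thinking_mask = enable_thinking

        # Leave the thinking phase once the think-end token is actually sampled.
        exists_think_end = (sampled_token_ids == think_end_id) & thinking_mask
        need_think_end = paddle.where(exists_think_end, need_think_end - 1, need_think_end)

        # While still thinking, spend one token of the reasoning budget.
        still_thinking = need_think_end.cast("bool") & thinking_mask
        reasoning_index = paddle.where(still_thinking, reasoning_index - 1, reasoning_index)

        # If EOS is sampled or the budget runs out before thinking ended,
        # force-emit think_end_id so decoding exits the thinking phase.
        stop_wo_think = (
            (sampled_token_ids == eos_token_id.T).any(axis=1, keepdim=True) | (reasoning_index == 0)
        ) & (need_think_end > 0) & thinking_mask
        sampled_token_ids = paddle.where(stop_wo_think, think_end_id, sampled_token_ids)
        need_think_end = paddle.where(stop_wo_think, need_think_end - 1, need_think_end)

        return sampled_token_ids, need_think_end, reasoning_index

The original implementation applied these updates in place with paddle.assign on the preallocated share_inputs tensors rather than returning new tensors, which is why every runner carried its own copy of this block.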