
Commit e224184

Revert "[TRTLLM-5972][chore] Load balance decode token KV cache with helix parallelism"
This reverts commit 6b60df1.
1 parent 28355cc commit e224184

File tree: 3 files changed (+10, -15 lines changed)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 0 additions & 1 deletion
@@ -694,7 +694,6 @@ def _merge_helix_requests(self, new_requests: list[RequestQueueItem],
                 position_ids=position_ids_this_rank,
             )
             req.total_input_len_cp = input_len
-            req.seqlen_this_rank_cp = len(input_ids_this_rank)
             req_with_children.append(req)
             if req.child_requests:
                 req_with_children.extend(req.child_requests)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 5 deletions
@@ -1671,12 +1671,12 @@ def _prepare_tp_inputs(
                 # Warmup doesn't have `total_input_len_cp` set because merge_helix_requests is not called.
                 if not self.is_warmup and not request.is_cuda_graph_dummy:
                     position_id = request.total_input_len_cp + request.py_decoding_iter - 1
-                    if request.py_helix_is_inactive_rank:
-                        past_seen_token_num = request.seqlen_this_rank_cp
+                    # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix.
+                    if self.mapping.cp_rank == self.mapping.cp_size - 1:
+                        past_seen_token_num = request.orig_prompt_len + request.py_decoding_iter - 1
                     else:
-                        # Discount the token added to active rank in resource manager as it hasn't
-                        # been previously seen.
-                        past_seen_token_num = request.seqlen_this_rank_cp - 1
+                        # past_seen_token_num doesn't grow on inactive ranks.
+                        past_seen_token_num = request.orig_prompt_len

                     position_ids.append(position_id)
                     num_cached_tokens_per_seq.append(past_seen_token_num)
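
Note: this hunk swaps the cached-token count back from per-rank sequence-length tracking to the older last-rank-is-active rule. A minimal, self-contained sketch of the two schemes follows; the function names and arguments are stand-ins for the request and mapping fields used above, not TensorRT-LLM APIs.

# Illustrative sketch of how past_seen_token_num differs between the reverted
# round-robin scheme and the restored last-rank-active scheme.
# All names are stand-ins; this is not repository code.

def past_seen_last_rank_active(orig_prompt_len: int, decoding_iter: int,
                               cp_rank: int, cp_size: int) -> int:
    """Restored behavior: only the last CP rank accumulates decode tokens."""
    if cp_rank == cp_size - 1:
        return orig_prompt_len + decoding_iter - 1
    # Inactive ranks never grow their cached-token count.
    return orig_prompt_len


def past_seen_round_robin(seqlen_this_rank_cp: int, is_inactive_rank: bool) -> int:
    """Reverted behavior: the active rank just had one token added in the
    resource manager, so discount it because it hasn't been seen yet."""
    if is_inactive_rank:
        return seqlen_this_rank_cp
    return seqlen_this_rank_cp - 1


if __name__ == "__main__":
    # With a 10-token prompt at decode iteration 3 on 2 CP ranks:
    print(past_seen_last_rank_active(10, 3, cp_rank=1, cp_size=2))  # 12 (active last rank)
    print(past_seen_last_rank_active(10, 3, cp_rank=0, cp_size=2))  # 10 (inactive rank)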

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 5 additions & 9 deletions
@@ -468,17 +468,13 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
                     req, block_ids)

         for req in generation_batch:
+            # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix.
             if self.mapping.has_cp_helix():
-                # Distribute the decode blocks across CP ranks in a round-robin manner.
-                decode_block_id = (req.py_decoding_iter -
-                                   1) // self.tokens_per_block
-                if decode_block_id % self.mapping.cp_size == self.mapping.cp_rank:
-                    req.py_helix_is_inactive_rank = False
-                    req.seqlen_this_rank_cp += 1
-                else:
+                if self.mapping.cp_rank != self.mapping.cp_size - 1:
                     req.py_helix_is_inactive_rank = True
-                    # Skip allocating KV cache at decode for inactive helix ranks.
-                    continue
+            # Skip allocating KV cache at decode for inactive helix ranks.
+            if req.py_helix_is_inactive_rank:
+                continue
             self.impl.add_token(req.py_request_id)
             for _ in range(get_draft_token_length(req)):
                 self.impl.add_token(req.py_request_id)
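
Note: the deleted branch balanced decode KV cache by assigning each decode block to a CP rank in round-robin order, while the restored code allocates decode blocks only on the last rank. A small sketch of the two ownership rules, using purely hypothetical helper names rather than TensorRT-LLM APIs:

# Illustrative sketch of the round-robin block ownership that the revert removes,
# versus always treating the last CP rank as the active one.

def active_rank_round_robin(decoding_iter: int, tokens_per_block: int,
                            cp_size: int) -> int:
    """Reverted scheme: the decode block index picks the owning CP rank."""
    decode_block_id = (decoding_iter - 1) // tokens_per_block
    return decode_block_id % cp_size


def active_rank_last_only(cp_size: int) -> int:
    """Restored scheme: the last CP rank always owns decode KV cache."""
    return cp_size - 1


if __name__ == "__main__":
    cp_size, tokens_per_block = 4, 32
    # Decode iterations 1-32 map to block 0 (rank 0), 33-64 to block 1 (rank 1), etc.
    for it in (1, 32, 33, 65, 129):
        print(it, active_rank_round_robin(it, tokens_per_block, cp_size),
              active_rank_last_only(cp_size))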
