@@ -932,6 +932,9 @@ class Store:
     max_lengths_tensor: torch.Tensor
     """Shape: batch_size
     Usage: Stores the maximum lengths for each request"""
+    end_ids: torch.Tensor
+    """Shape: batch_size
+    Usage: Stores the end ids for each request"""
     finish_reasons: torch.Tensor
     """Shape: max_tokens, batch_size, beam_width
     Usage: Stores the currently estimated finish_reasons for each request"""
@@ -977,7 +980,8 @@ def _create_store(self) -> Store:
         # Tensors necessary for all sampling methods
         new_tokens = int_tensor(self.NEW_TOKENS_SHAPE)
         finish_reasons = int_tensor(self.NEW_TOKENS_SHAPE)
-        max_lengths_tensor = int_tensor(self.max_num_sequences),
+        max_lengths_tensor = int_tensor(self.max_num_sequences)
+        end_ids = int_tensor(self.max_num_sequences)

         # Only used for logprobs processing or beam search
         sampled_log_probs = torch.empty(self.LOGPROBS_SHAPE, device="cuda", dtype=torch.float32)
@@ -1012,6 +1016,7 @@ def _create_store(self) -> Store:
             new_tokens=new_tokens,
             finish_reasons=finish_reasons,
             max_lengths_tensor=max_lengths_tensor,
+            end_ids=end_ids,
             cache_indirection=cache_indirection,
             cache_indirection_buffer=cache_indirection_buffer,
             cum_log_probs=cum_log_probs,
@@ -1549,6 +1554,9 @@ def setup_sampler_step(self, requests: ScheduledRequests):
             self.store.max_lengths_tensor[request.py_seq_slot].fill_(
                 min(self.max_seq_len, request.orig_prompt_len + request.py_max_new_tokens)
             )
+            self.store.end_ids[request.py_seq_slot].fill_(
+                request.py_end_id if request.py_end_id is not None else -1
+            )

     def _prepare_beam_search(
         self,
@@ -2822,7 +2830,7 @@ def _write_finish_reasons(
             batched_finish_reasons,
         )
         batched_finish_reasons = torch.where(
-            self._are_end_id(requests, tokens),
+            self._are_end_id(self.store.end_ids[seq_slots], tokens),
             self._reason_tensors[FinishReason.END_ID],
             batched_finish_reasons,
         )
@@ -2838,21 +2846,8 @@ def _write_finish_reasons(
         )
         first_finish_reasons[seq_slots] = batched_first_finish_reasons

-    def _are_end_id(self, requests: list[LlmRequest], tokens: torch.Tensor) -> torch.Tensor:
-        end_ids_tensor = (
-            torch.tensor(
-                [
-                    ([req.py_end_id if req.py_end_id is not None else -1] * self.max_beam_width)
-                    for req in requests
-                ]
-                * self.max_tokens,
-                pin_memory=True,
-                dtype=tokens.dtype,
-            )
-            .view(self.max_tokens, len(requests), self.max_beam_width)
-            .to(device="cuda", non_blocking=True)
-        )
-        return tokens == end_ids_tensor
+    def _are_end_id(self, end_ids: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
+        return tokens == end_ids.view(1, -1, 1).expand(self.max_tokens, -1, self.max_beam_width)

     def _are_max_length(self, seq_lens: torch.Tensor, max_seq_lens: torch.Tensor) -> torch.Tensor:
         """Checks which sequences are at or beyond the max length
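A note on the rewritten _are_end_id, for readers skimming the diff: the removed version rebuilt a pinned host tensor of end ids on every call and copied it to the GPU, while the new version compares against the persistent store.end_ids tensor (filled once per request in setup_sampler_step) via a broadcasted view. The standalone sketch below, using made-up shapes and values rather than anything from the PR, illustrates that broadcast.

import torch

# Illustrative shapes only; not the values used in the PR.
max_tokens, batch_size, beam_width = 2, 3, 2

# Per-slot end ids, shape [batch_size]; -1 marks "no end id", as in setup_sampler_step.
end_ids = torch.tensor([7, -1, 5])

# Sampled tokens, shape [max_tokens, batch_size, beam_width].
tokens = torch.tensor(
    [
        [[7, 1], [2, 3], [5, 5]],
        [[4, 7], [6, 6], [0, 5]],
    ]
)

# view(1, -1, 1) followed by expand yields a [max_tokens, batch_size, beam_width] view
# of end_ids without copying, so the elementwise == lines up with tokens position by position.
mask = tokens == end_ids.view(1, -1, 1).expand(max_tokens, -1, beam_width)
print(mask)
# The middle request (end_id -1) stays False everywhere, since no real token id is -1.

Because store.end_ids is presumably allocated on the GPU alongside the other store tensors built by int_tensor, the comparison also avoids the per-step host allocation, pinning, and host-to-device copy that the removed implementation paid.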