@@ -980,8 +980,8 @@ def _create_store(self) -> Store:
         # Tensors necessary for all sampling methods
         new_tokens = int_tensor(self.NEW_TOKENS_SHAPE)
         finish_reasons = int_tensor(self.NEW_TOKENS_SHAPE)
-        max_lengths_tensor = int_tensor(self.max_num_sequences)
-        end_ids = int_tensor(self.max_num_sequences)
+        max_lengths_tensor = int_tensor(self.max_num_sequences)
+        end_ids = int_tensor(self.max_num_sequences)

         # Only used for logprobs processing or beam search
         sampled_log_probs = torch.empty(self.LOGPROBS_SHAPE, device="cuda", dtype=torch.float32)
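
The `int_tensor` helper is defined elsewhere in the file and not shown in this hunk. A minimal sketch of what it is assumed to do, mirroring the explicit `torch.empty` call used for `sampled_log_probs` just above (the real helper may take a dtype argument or use a different allocator):

import torch

# Assumed behaviour of the int_tensor helper used above (not part of this diff).
def int_tensor(shape) -> torch.Tensor:
    # Uninitialized CUDA buffer with integer dtype, filled later by the sampling kernels.
    return torch.empty(shape, device="cuda", dtype=torch.int32)
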
@@ -1082,6 +1082,9 @@ def __init__(self, args: Args):
                 FinishReason.CANCELLED,
             ]  # `in FinishReason` clashes with PyBind11: `TypeError: 'pybind11_type' object is not iterable`
         }
+        self._max_tokens_offset = torch.arange(
+            1, self.max_tokens + 1, device="cuda", dtype=torch.int32
+        ).view(-1, 1, 1)

         self._grouped_sampler_cls: Type[GroupedStrategySampler]
         if IS_FLASHINFER_AVAILABLE and not args.disable_flashinfer_sampling:
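
Why this cache helps: `_are_max_length` below previously rebuilt a `torch.arange` on every decoding step; storing the 1..max_tokens offset once in `__init__` lets the hot path do a single broadcasted add. A small CPU-only sketch of the broadcasting, with illustrative sizes (the real tensors live on CUDA and `num_seqs`/`max_beam_width` mirror the engine configuration):

import torch

max_tokens, num_seqs, max_beam_width = 4, 3, 2          # illustrative sizes
# Built once, shaped so the future-step index broadcasts along dim 0.
max_tokens_offset = torch.arange(1, max_tokens + 1, dtype=torch.int32).view(-1, 1, 1)

seq_lens = torch.tensor([5, 7, 9], dtype=torch.int32)   # current length per request
# (1, num_seqs, 1) + (max_tokens, 1, 1) -> (max_tokens, num_seqs, 1),
# then expand the beam dimension without copying data.
lengths = (seq_lens.view(1, -1, 1) + max_tokens_offset).expand(max_tokens, -1, max_beam_width)
assert lengths.shape == (max_tokens, num_seqs, max_beam_width)
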
@@ -2864,12 +2867,9 @@ def _are_max_length(self, seq_lens: torch.Tensor, max_seq_lens: torch.Tensor) ->
             A tensor of shape (max_tokens, len(requests), max_beam_width)
             where each element is True if the sequence is at or beyond the max length, False otherwise
         """
-        lengths_tensor = (
-            seq_lens.view(1, -1, 1)
-            + torch.arange(
-                1, self.max_tokens + 1, device=seq_lens.device, dtype=seq_lens.dtype
-            ).view(-1, 1, 1)
-        ).expand(self.max_tokens, -1, self.max_beam_width)
+        lengths_tensor = (seq_lens.view(1, -1, 1) + self._max_tokens_offset).expand(
+            self.max_tokens, -1, self.max_beam_width
+        )
         max_lengths_tensor = max_seq_lens.view(1, -1, 1).expand(
             self.max_tokens, -1, self.max_beam_width
         )
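
For context, a hedged sketch of how the two expanded tensors are assumed to be combined into the boolean mask described in the docstring; the comparison itself sits outside this hunk and the real method may also account for already-finished sequences:

# Hypothetical continuation of _are_max_length (not shown in this diff).
# True wherever a sequence would reach or exceed its per-request limit at that
# future step; shape (max_tokens, len(requests), max_beam_width).
are_max_length = lengths_tensor >= max_lengths_tensor
return are_max_length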