Commit b38c808

Merge PR1974 intervl:cache prompt_tokens for sampling metadata
In sampling, when penalties are applied, the prompt_tokens tensor is regenerated for every decode step, which takes time. Instead, we can cache it and reset the cache when the set of requests changes.
1 parent 65abdfb commit b38c808
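
The change follows a simple cache-and-invalidate pattern: keep the padded prompt-token tensor built for the previous decode step, reuse it while the set of sequence ids in the batch is unchanged, and drop it as soon as the batch composition changes. A minimal standalone sketch of that pattern (the class and method names below are illustrative, not the vLLM API):

```python
from typing import Callable, Optional

import torch


class PromptTokensCache:
    """Illustrative cache keyed on the set of sequence ids in the batch."""

    def __init__(self) -> None:
        self._cached_tensor: Optional[torch.Tensor] = None
        self._cached_seq_ids: Optional[set] = None

    def get(self, seq_ids: set,
            build: Callable[[], torch.Tensor]) -> torch.Tensor:
        # Invalidate when the batch composition changed.
        if seq_ids != self._cached_seq_ids:
            self._cached_tensor = None
            self._cached_seq_ids = set(seq_ids)
        # Rebuild (pad + host-to-device copy) only on a cache miss.
        if self._cached_tensor is None:
            self._cached_tensor = build()
        return self._cached_tensor
```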

File tree

3 files changed: +47 -16 lines

vllm/model_executor/layers/sampler.py

Lines changed: 13 additions & 2 deletions
@@ -197,6 +197,9 @@ def __init__(self):
         # speculative decoding and when prompt embeddings are specified.
         self.include_gpu_probs_tensor = False
         self.should_modify_greedy_probs_inplace = False
+        # Add HPU cache class variables
+        self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None
+        self._cached_seq_ids: Optional[set] = None

     def _init_sampling_tensors(
         self,
@@ -216,8 +219,10 @@ def _init_sampling_tensors(

         # Initialize new sampling tensors
         (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p,
-         top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata(
-             sampling_metadata, vocab_size, logits.device, logits.dtype)
+         top_k_scalar, top_p_scalar, current_seq_ids) = \
+            SamplingTensors.from_sampling_metadata(
+                sampling_metadata, vocab_size, logits.device, logits.dtype, \
+                self._prompt_tokens_hpu_cache, self._cached_seq_ids)

         self._sampling_tensors = sampling_tensors
         self._do_penalties = do_penalties
@@ -227,6 +232,12 @@ def _init_sampling_tensors(
         self._top_p_scalar = top_p_scalar

         self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5)
+        # Check if batch composition changed - if so, invalidate prompt cache
+
+        # After tensors are created, update cache
+        if self._cached_seq_ids != current_seq_ids:
+            self._prompt_tokens_hpu_cache = None
+        self._cached_seq_ids = current_seq_ids

     def forward(
         self,

vllm/model_executor/sampling_metadata.py

Lines changed: 28 additions & 14 deletions
@@ -16,6 +16,8 @@
     make_tensor_with_pad_align)

 _SAMPLING_EPS = 1e-5
+pin_memory = is_pin_memory_available()
+is_hpu = current_platform.is_hpu()


 @dataclass
@@ -286,7 +288,7 @@ def _prepare_seq_groups(

         if seq_group_metadata.is_prompt:
             if sampling_params.seed is not None:
-                if current_platform.is_hpu():
+                if is_hpu:
                     import habana_frameworks.torch.hpu.random as htrandom
                     generator = \
                         htrandom.default_generators[
@@ -420,8 +422,10 @@ def from_sampling_metadata(
         vocab_size: int,
         device: torch.device,
         dtype: torch.dtype,
+        prompt_tokens_cache: torch.tensor,
+        past_seq_ids: set,
     ) -> tuple["SamplingTensors", bool, bool, bool, Optional[int],
-               Optional[float]]:
+               Optional[float], Optional[torch.tensor]]:
         prompt_tokens: list[array] = []
         output_tokens: list[array] = []
         top_ks: list[int] = []
@@ -434,6 +438,7 @@ def from_sampling_metadata(
         do_penalties = False
         do_top_p_top_k = False
         do_min_p = False
+        current_seq_ids = set()

         assert sampling_metadata.seq_groups is not None
         for seq_group in sampling_metadata.seq_groups:
@@ -508,6 +513,9 @@ def from_sampling_metadata(
                     seq_data = seq_group.seq_data[seq_id]
                     prompt_tokens.append(seq_data.prompt_token_ids_array)
                     output_tokens.append(seq_data.output_token_ids_array)
+            current_seq_ids.update(seq_ids)
+        if current_seq_ids != past_seq_ids:
+            prompt_tokens_cache = None

         top_k_scalar = top_ks[0] if do_top_p_top_k and all(
             k == top_ks[0] for k in top_ks) else None
@@ -527,9 +535,10 @@ def from_sampling_metadata(
             vocab_size,
             device,
             dtype,
+            prompt_tokens_cache,
         )
         return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p,
-                top_k_scalar, top_p_scalar)
+                top_k_scalar, top_p_scalar, current_seq_ids)

     @classmethod
     def from_lists(
@@ -546,23 +555,28 @@ def from_lists(
         vocab_size: int,
         device: torch.device,
         dtype: torch.dtype,
+        prompt_tokens_cache: torch.tensor,
     ) -> "SamplingTensors":
         # Note that the performance will be very bad without
         # pinned memory.
-        pin_memory = is_pin_memory_available()

         do_penalties = prompt_tokens or output_tokens

         if do_penalties:
-            if current_platform.is_hpu():
-                prompt_t = make_tensor_with_pad_align(
-                    prompt_tokens,
-                    vocab_size,
-                    device="cpu",
-                    dtype=torch.int64,
-                    pin_memory=pin_memory,
-                    max_len_align=1024,
-                )
+            if is_hpu:
+                if (prompt_tokens_cache is not None and
+                        prompt_tokens_cache.device == device):
+                    # Reuse cached prompt_tokens already on HPU
+                    prompt_t = prompt_tokens_cache
+                else:
+                    prompt_t = make_tensor_with_pad_align(
+                        prompt_tokens,
+                        vocab_size,
+                        device="cpu",
+                        dtype=torch.int64,
+                        pin_memory=pin_memory,
+                        max_len_align=1024,
+                    )
             output_t = make_tensor_with_pad_align(
                 output_tokens,
                 vocab_size,
@@ -647,6 +661,6 @@ def from_lists(
                                                            non_blocking=True),
             repetition_penalties=repetition_penalties_t.to(device=device,
                                                            non_blocking=True),
-            prompt_tokens=prompt_t.to(device=device, non_blocking=True),
+            prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t,
             output_tokens=output_t.to(device=device, non_blocking=True),
         )
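
For context on what a cache hit avoids: with penalties enabled, every decode step otherwise pads each request's prompt token ids into an int64 tensor on the CPU (padded with vocab_size in the code above) and copies it to the device. A rough, simplified sketch of that pad-and-copy step follows; it is not the real make_tensor_with_pad_align helper, and the length alignment and pinned host memory are left out:

```python
import torch


def pad_prompts_to_tensor(prompt_tokens: list[list[int]], pad_id: int,
                          device: torch.device) -> torch.Tensor:
    """Simplified stand-in for the pad-and-copy work skipped on cache hits."""
    max_len = max(len(p) for p in prompt_tokens)
    out = torch.full((len(prompt_tokens), max_len), pad_id, dtype=torch.int64)
    for row, tokens in enumerate(prompt_tokens):
        out[row, :len(tokens)] = torch.tensor(tokens, dtype=torch.int64)
    # Without the cache, this host-to-device copy runs on every decode step.
    return out.to(device=device, non_blocking=True)
```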

vllm/worker/hpu_model_runner.py

Lines changed: 6 additions & 0 deletions
@@ -4023,6 +4023,12 @@ def try_revert_dummy_output_tokens():
                 self.cached_step_inputs.append(model_input)
                 if self.do_mark_step:
                     htorch.core.mark_step()
+                if hasattr(self.model.sampler, '_sampling_tensors') and \
+                        self.model.sampler._sampling_tensors is not None:
+                    sampling_tensors = self.model.sampler._sampling_tensors
+                    if sampling_tensors.prompt_tokens.numel() > 0:
+                        # Cache the prompt_tokens tensor that's already on HPU
+                        self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens
                 if use_delayed_sampling \
                         and model_input.async_callback is not None:
                     model_input.async_callback()
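
Taken together, the sampler owns the two cache fields, SamplingTensors consults them while building tensors, and the HPU model runner writes the freshly built HPU-resident prompt_tokens back into the sampler after mark_step. A self-contained toy (illustrative only, not vLLM code) of the resulting hit/miss behaviour across decode steps:

```python
import torch


class SamplerCacheDemo:
    """Toy stand-in for the sampler-side cache fields; not vLLM code."""

    def __init__(self) -> None:
        self.prompt_tokens_cache = None
        self.cached_seq_ids = None
        self.builds = 0  # counts how often the padded tensor is rebuilt

    def step(self, seq_ids: set) -> torch.Tensor:
        if seq_ids != self.cached_seq_ids:
            self.prompt_tokens_cache = None      # batch changed: invalidate
            self.cached_seq_ids = seq_ids
        if self.prompt_tokens_cache is None:
            self.builds += 1                     # pad + copy would happen here
            self.prompt_tokens_cache = torch.zeros(len(seq_ids), 8,
                                                   dtype=torch.int64)
        return self.prompt_tokens_cache


demo = SamplerCacheDemo()
demo.step({0, 1})   # miss: tensor built and moved to the device
demo.step({0, 1})   # hit: same request set, tensor reused
demo.step({0, 2})   # miss: request set changed, tensor rebuilt
assert demo.builds == 2
```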
