Commit 2a54824

Reject requests with prompt logprobs when using fast prefill
Signed-off-by: Yong Hoon Shin <[email protected]>
1 parent 72c0780 commit 2a54824

3 files changed (11 additions, 28 deletions)

vllm/v1/attention/backends/utils.py (6 additions, 19 deletions)

@@ -69,7 +69,6 @@ class CommonAttentionMetadata:
 
     logits_indices_padded: Optional[torch.Tensor] = None
     num_logits_indices: Optional[int] = None
-    prompt_logprobs: Optional[bool] = None
 
     causal: bool = True
 
@@ -837,25 +836,13 @@ def build(self,
                   common_prefix_len: int,
                   common_attn_metadata: CommonAttentionMetadata,
                   fast_build: bool = False) -> AttentionMetadata:
-            # Either not set (None) or prompt_logprobs is False
-            if not common_attn_metadata.prompt_logprobs:
-                # Fast prefill path
-                new_common_attn_metadata =\
-                    make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
-                metadata = super(self.__class__,
-                                 self).build(common_prefix_len,
-                                             new_common_attn_metadata, fast_build)
-                return create_kv_sharing_fast_prefill_attn_metadata_subclass(
-                    metadata, common_attn_metadata)
-
-            # Default path:
-            # Either --kv-sharing-fast-prefill is not set or at least one request
-            # in the current scheduling round requests logprobs for prompt tokens
-            # which is not compatible with fast prefill
+            new_common_attn_metadata =\
+                make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
             metadata = super(self.__class__,
-                             self).build(common_prefix_len, common_attn_metadata,
-                                         fast_build)
-            return metadata
+                             self).build(common_prefix_len,
+                                         new_common_attn_metadata, fast_build)
+            return create_kv_sharing_fast_prefill_attn_metadata_subclass(
+                metadata, common_attn_metadata)
 
     # Dynamically create a new attention backend that wraps the
     # underlying attention backend but applies
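
Note on the simplified build path: with the per-batch prompt_logprobs branch removed, the wrapped builder always produces fast-prefill metadata, and compatibility with prompt logprobs is now enforced once at request admission rather than per scheduling step. A condensed sketch of the resulting control flow (the class and base names below are illustrative stand-ins, not the repository's actual names; the two helper functions are the ones referenced in the diff):

    # Sketch only: `UnderlyingBuilder` is a placeholder for whatever attention
    # metadata builder the dynamic backend factory wraps.
    class FastPrefillMetadataBuilder(UnderlyingBuilder):

        def build(self, common_prefix_len, common_attn_metadata,
                  fast_build=False):
            # Always narrow the metadata to the logits-producing tokens;
            # requests asking for prompt logprobs were rejected earlier.
            narrowed = make_kv_sharing_fast_prefill_common_attn_metadata(
                common_attn_metadata)
            metadata = super().build(common_prefix_len, narrowed, fast_build)
            return create_kv_sharing_fast_prefill_attn_metadata_subclass(
                metadata, common_attn_metadata)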

vllm/v1/engine/async_llm.py (5 additions, 0 deletions)

@@ -335,6 +335,11 @@ async def generate(
         returning the RequestOutput back to the caller.
         """
 
+        if (self.vllm_config.cache_config.kv_sharing_fast_prefill
+                and sampling_params.prompt_logprobs):
+            raise ValueError(
+                "Fast prefill produces incorrect logprobs for prompt tokens")
+
         try:
             # We start the output_handler on the first call to generate() so
             # we can call __init__ before the event loop, which enables us
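
From the caller's side, a request that asks for prompt logprobs while KV-sharing fast prefill is enabled now fails fast with the ValueError above instead of silently falling back for that iteration. A minimal sketch of hitting the check (the engine construction and exact flag plumbing are assumptions and may differ by version; the error message is the one added in this commit):

    # Sketch only: assumes `engine` is an already-constructed v1 AsyncLLM
    # running with kv_sharing_fast_prefill enabled in its cache config.
    from vllm import SamplingParams

    params = SamplingParams(max_tokens=16, prompt_logprobs=5)

    async def probe(engine, prompt: str) -> None:
        try:
            async for out in engine.generate(prompt, params, request_id="req-0"):
                pass
        except ValueError as err:
            # Expected: "Fast prefill produces incorrect logprobs for prompt tokens"
            print(f"request rejected: {err}")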

vllm/v1/worker/gpu_model_runner.py (0 additions, 9 deletions)

@@ -846,14 +846,6 @@ def _prepare_inputs(
             self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs])
         spec_decode_common_attn_metadata = None
 
-        if (self.cache_config.kv_sharing_fast_prefill
-                and self.input_batch.num_prompt_logprobs):
-            logger.warning(
-                "Encountered at least one request with prompt_logprobs set "
-                "with --kv-sharing-fast-prefill enabled. Fast prefill doesn't "
-                "produce correct logits for prompt tokens, so fast prefill will"
-                " be disabled for this iteration.")
-
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
@@ -901,7 +893,6 @@ def _prepare_inputs(
             slot_mapping=slot_mapping,
             logits_indices_padded=logits_indices_padded,
             num_logits_indices=logits_indices.size(0),
-            prompt_logprobs=len(self.input_batch.num_prompt_logprobs) > 0,
             causal=True,
         )