1 change: 0 additions & 1 deletion docs/features/supported_features.md
@@ -27,5 +27,4 @@ title: Supported Features
| Multinode support | vLLM HPU backend supports distributed, multiple-node inference with Ray. | <REF> |
| vLLM v1 architecture (early release) | V1 architecture is now available for the HPU backend, and we will gradually enable it for every use case we plan to support. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) |
| Guided decode | vLLM HPU supports a guided decoding backend for generating structured outputs. | [Documentation](https://docs.vllm.ai/en/latest/features/structured_outputs.html) |
-| Delayed Sampling (experimental) | vLLM HPU supports delayed sampling scheduling for asynchronous execution, enabled by `VLLM_DELAYED_SAMPLING=true` environment variable. | N/A |
| Exponential bucketing | vLLM HPU supports exponential bucketing spacing instead of linear to automate configuration of the bucketing mechanism, enabled by default. It can be disabled via the `VLLM_EXPONENTIAL_BUCKETING=false` environment variable. | N/A |
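As a usage note (not part of this diff): the features in the table above are toggled through environment variables that vLLM reads at process start. A minimal sketch, assuming the offline `LLM` API and an illustrative model name:

```python
# Sketch only: disable exponential bucket spacing (falling back to linear
# bucketing) before the engine is constructed. The env var comes from the
# table above; the model name is purely illustrative.
import os

os.environ["VLLM_EXPONENTIAL_BUCKETING"] = "false"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```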
7 changes: 0 additions & 7 deletions vllm_gaudi/envs.py
@@ -5,7 +5,6 @@

if TYPE_CHECKING:
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
-VLLM_HPU_USE_DELAYED_SAMPLING: bool = False
VLLM_HPU_FORCE_CHANNEL_FP8: bool = True

# The begin-* and end* here are used by the documentation generator
@@ -20,12 +19,6 @@
lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
("1", "true"),

-# Use delayed sampling for HPU to reduce host cpu overhead
-# between each step.
-"VLLM_HPU_USE_DELAYED_SAMPLING":
-lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in
-("1", "true"),

# Convert block fp8 to channel fp8 for HPU
"VLLM_HPU_FORCE_CHANNEL_FP8":
lambda: os.environ.get("VLLM_HPU_FORCE_CHANNEL_FP8", "true").lower() in
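For context on the hunk above: entries in this module map environment-variable names to lazy lambdas that treat "1"/"true" (case-insensitive) as truthy. A minimal sketch of that pattern; `bool_env` is an illustrative helper, not something defined in `envs.py`:

```python
# Sketch of the boolean env-var pattern used in envs.py.
import os
from typing import Callable

def bool_env(name: str, default: str) -> Callable[[], bool]:
    """Lazily read an env var, treating "1"/"true" (any case) as True."""
    return lambda: os.environ.get(name, default).lower() in ("1", "true")

environment_variables: dict[str, Callable[[], bool]] = {
    "VLLM_HPU_FORCE_CHANNEL_FP8": bool_env("VLLM_HPU_FORCE_CHANNEL_FP8", "true"),
}

print(environment_variables["VLLM_HPU_FORCE_CHANNEL_FP8"]())  # True unless overridden
```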
3 changes: 2 additions & 1 deletion vllm_gaudi/extension/features.py
@@ -71,14 +71,15 @@ def get_features():
Value('skip_warmup', False),
Value('merged_prefill', False),
Value('use_contiguous_pa', Disabled('prefix_caching'), env_var='VLLM_CONTIGUOUS_PA'),
-Value('use_delayed_sampling', Engine('v0'), env_var='VLLM_DELAYED_SAMPLING'),
Value('use_bucketing', True, env_var='VLLM_ENABLE_BUCKETING'),
Value('exponential_bucketing', True),
Value('linear_bucketing', True),
Value('lookahead_decoding', False, env_var='VLLM_USE_LOOKAHEAD_DECODING'),
ValueFromList('bucketing_strategy', bucketing_strategies),
Value('defrag', False),
Value('regional_compilation', True, env_var='VLLM_T_COMPILE_REGIONAL_COMPILATION', env_var_type=boolean),
Value('dynamic_shapes_compilation', True, env_var='VLLM_T_COMPILE_DYNAMIC_SHAPES', env_var_type=boolean),
Value('fullgraph_compilation', False, env_var='VLLM_T_COMPILE_FULLGRAPH', env_var_type=boolean),

]
return split_values_and_flags(features)
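A rough sketch of how a flag like those above could resolve against its env-var override; this `Value` is a simplified stand-in for the plugin's class, not its actual implementation:

```python
# Simplified stand-in: a feature flag with an optional env-var override.
import os
from dataclasses import dataclass
from typing import Any, Callable

def boolean(raw: str) -> bool:
    return raw.lower() in ("1", "true")

@dataclass
class Value:
    name: str
    default: Any
    env_var: str | None = None
    env_var_type: Callable[[str], Any] = boolean

    def resolve(self) -> Any:
        # An explicitly set env var wins over the coded default.
        if self.env_var and self.env_var in os.environ:
            return self.env_var_type(os.environ[self.env_var])
        return self.default

flag = Value('use_bucketing', True, env_var='VLLM_ENABLE_BUCKETING')
print(flag.resolve())  # True unless VLLM_ENABLE_BUCKETING says otherwise
```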
9 changes: 1 addition & 8 deletions vllm_gaudi/v1/worker/hpu_input_batch.py
@@ -605,14 +605,7 @@ def make_selective_sampling_metadata(
for req_id, _ in req_id_output_token_ids
]
prompt_token_ids = None
-if not skip_copy:
-self.temperature[req_indices].copy_(
-self.temperature_cpu_tensor[req_indices], non_blocking=True)
-self.top_p[req_indices].copy_(self.top_p_cpu_tensor[req_indices],
-non_blocking=True)
-self.top_k[req_indices].copy_(self.top_k_cpu_tensor[req_indices],
-non_blocking=True)
-if not self.no_penalties:
+if not skip_copy and not self.no_penalties:
# Since syncing these tensors is expensive only copy them
# if necessary i.e. if there are requests which require
# penalties to be applied during sampling.
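The new condition above drops the unconditional temperature/top-p/top-k copies and gates the remaining penalty-tensor sync on both `skip_copy` and `no_penalties`, so the expensive host-to-device copy only happens when some request actually applies penalties. A standalone sketch of that conditional non-blocking copy in plain PyTorch (tensor names echo the snippet; sizes, device choice, and the contiguous-slice indexing are illustrative simplifications):

```python
# Copy sampling tensors to the device only when penalties are in play.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_reqs = 4
temperature_cpu = torch.rand(16)              # host-side staging tensor
temperature = torch.empty(16, device=device)  # device-side tensor used by sampling

skip_copy, no_penalties = False, False

if not skip_copy and not no_penalties:
    # Syncing these tensors is expensive, so only copy them when a request
    # requires penalties; non_blocking lets the copy overlap other work.
    temperature[:num_reqs].copy_(temperature_cpu[:num_reqs], non_blocking=True)
```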