1 change: 0 additions & 1 deletion docs/features/supported_features.md
@@ -27,5 +27,4 @@ title: Supported Features
| Multinode support | vLLM HPU backend supports distributed, multiple-node inference with Ray. | <REF> |
| vLLM v1 architecture (early release) | V1 architecture is now available for the HPU backend, and we will gradually enable it for every use case we plan to support. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) |
| Guided decode | vLLM HPU supports a guided decoding backend for generating structured outputs. | [Documentation](https://docs.vllm.ai/en/latest/features/structured_outputs.html) |
-| Delayed Sampling (experimental) | vLLM HPU supports delayed sampling scheduling for asynchronous execution, enabled by `VLLM_DELAYED_SAMPLING=true` environment variable. | N/A |
| Exponential bucketing | vLLM HPU supports exponential bucketing spacing instead of linear to automate configuration of the bucketing mechanism, enabled by default. It can be disabled via the `VLLM_EXPONENTIAL_BUCKETING=false` environment variable. | N/A |
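As a usage note (not part of this diff): the features in the table above are toggled through environment variables that vLLM reads at process start. A minimal sketch, assuming the offline `LLM` API and an illustrative model name:

```python
# Sketch only: disable exponential bucket spacing (falling back to linear
# bucketing) before the engine is constructed. The env var comes from the
# table above; the model name is purely illustrative.
import os

os.environ["VLLM_EXPONENTIAL_BUCKETING"] = "false"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```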
7 changes: 0 additions & 7 deletions vllm_gaudi/envs.py
@@ -5,7 +5,6 @@

if TYPE_CHECKING:
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
-VLLM_HPU_USE_DELAYED_SAMPLING: bool = False
VLLM_HPU_FORCE_CHANNEL_FP8: bool = True

# The begin-* and end* here are used by the documentation generator
@@ -20,12 +19,6 @@
lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
("1", "true"),

-# Use delayed sampling for HPU to reduce host cpu overhead
-# between each step.
-"VLLM_HPU_USE_DELAYED_SAMPLING":
-lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in
-("1", "true"),

# Convert block fp8 to channel fp8 for HPU
"VLLM_HPU_FORCE_CHANNEL_FP8":
lambda: os.environ.get("VLLM_HPU_FORCE_CHANNEL_FP8", "true").lower() in
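For context on the hunk above: entries in this module map environment-variable names to lazy lambdas that treat "1"/"true" (case-insensitive) as truthy. A minimal sketch of that pattern; `bool_env` is an illustrative helper, not something defined in `envs.py`:

```python
# Sketch of the boolean env-var pattern used in envs.py.
import os
from typing import Callable

def bool_env(name: str, default: str) -> Callable[[], bool]:
    """Lazily read an env var, treating "1"/"true" (any case) as True."""
    return lambda: os.environ.get(name, default).lower() in ("1", "true")

environment_variables: dict[str, Callable[[], bool]] = {
    "VLLM_HPU_FORCE_CHANNEL_FP8": bool_env("VLLM_HPU_FORCE_CHANNEL_FP8", "true"),
}

print(environment_variables["VLLM_HPU_FORCE_CHANNEL_FP8"]())  # True unless overridden
```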
3 changes: 2 additions & 1 deletion vllm_gaudi/extension/features.py
@@ -71,14 +71,15 @@ def get_features():
Value('skip_warmup', False),
Value('merged_prefill', False),
Value('use_contiguous_pa', Disabled('prefix_caching'), env_var='VLLM_CONTIGUOUS_PA'),
-Value('use_delayed_sampling', Engine('v0'), env_var='VLLM_DELAYED_SAMPLING'),
Value('use_bucketing', True, env_var='VLLM_ENABLE_BUCKETING'),
Value('exponential_bucketing', True),
Value('linear_bucketing', True),
Value('lookahead_decoding', False, env_var='VLLM_USE_LOOKAHEAD_DECODING'),
ValueFromList('bucketing_strategy', bucketing_strategies),
Value('defrag', False),
Value('regional_compilation', True, env_var='VLLM_T_COMPILE_REGIONAL_COMPILATION', env_var_type=boolean),
Value('dynamic_shapes_compilation', True, env_var='VLLM_T_COMPILE_DYNAMIC_SHAPES', env_var_type=boolean),
Value('fullgraph_compilation', False, env_var='VLLM_T_COMPILE_FULLGRAPH', env_var_type=boolean),

]
return split_values_and_flags(features)
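A rough sketch of how a flag like those above could resolve against its env-var override; this `Value` is a simplified stand-in for the plugin's class, not its actual implementation:

```python
# Simplified stand-in: a feature flag with an optional env-var override.
import os
from dataclasses import dataclass
from typing import Any, Callable

def boolean(raw: str) -> bool:
    return raw.lower() in ("1", "true")

@dataclass
class Value:
    name: str
    default: Any
    env_var: str | None = None
    env_var_type: Callable[[str], Any] = boolean

    def resolve(self) -> Any:
        # An explicitly set env var wins over the coded default.
        if self.env_var and self.env_var in os.environ:
            return self.env_var_type(os.environ[self.env_var])
        return self.default

flag = Value('use_bucketing', True, env_var='VLLM_ENABLE_BUCKETING')
print(flag.resolve())  # True unless VLLM_ENABLE_BUCKETING says otherwise
```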
9 changes: 1 addition & 8 deletions vllm_gaudi/v1/worker/hpu_input_batch.py
@@ -605,14 +605,7 @@ def make_selective_sampling_metadata(
for req_id, _ in req_id_output_token_ids
]
prompt_token_ids = None
-if not skip_copy:
-self.temperature[req_indices].copy_(
-self.temperature_cpu_tensor[req_indices], non_blocking=True)
-self.top_p[req_indices].copy_(self.top_p_cpu_tensor[req_indices],
-non_blocking=True)
-self.top_k[req_indices].copy_(self.top_k_cpu_tensor[req_indices],
-non_blocking=True)
-if not self.no_penalties:
+if not skip_copy and not self.no_penalties:
# Since syncing these tensors is expensive only copy them
# if necessary i.e. if there are requests which require
# penalties to be applied during sampling.
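The new condition above drops the unconditional temperature/top-p/top-k copies and gates the remaining penalty-tensor sync on both `skip_copy` and `no_penalties`, so the expensive host-to-device copy only happens when some request actually applies penalties. A standalone sketch of that conditional non-blocking copy in plain PyTorch (tensor names echo the snippet; sizes, device choice, and the contiguous-slice indexing are illustrative simplifications):

```python
# Copy sampling tensors to the device only when penalties are in play.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_reqs = 4
temperature_cpu = torch.rand(16)              # host-side staging tensor
temperature = torch.empty(16, device=device)  # device-side tensor used by sampling

skip_copy, no_penalties = False, False

if not skip_copy and not no_penalties:
    # Syncing these tensors is expensive, so only copy them when a request
    # requires penalties; non_blocking lets the copy overlap other work.
    temperature[:num_reqs].copy_(temperature_cpu[:num_reqs], non_blocking=True)
```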