Commit 78c13e3

[V1] Fix local chunked attention always disabled (#21419)
Signed-off-by: Yong Hoon Shin <[email protected]>
1 parent 5c9b807 commit 78c13e3

File tree

1 file changed: +2 −1 lines changed


vllm/attention/layer.py

Lines changed: 2 additions & 1 deletion
@@ -143,6 +143,8 @@ def __init__(
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ def __init__(
             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
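For context on the bug: in the V1 path, pop("use_irope", ...) removes the key from extra_impl_args, so the later unconditional extra_impl_args.get("use_irope", False) always fell back to False and overwrote the popped value, leaving local chunked attention (iRoPE) permanently disabled. Below is a minimal, self-contained sketch of the before/after behavior; the function names are illustrative only and not part of vLLM's API.

# Minimal standalone sketch (not vLLM's Attention class) of the bug this
# commit fixes and of the corrected control flow.

def use_irope_before_fix(use_v1: bool, extra_impl_args: dict) -> bool:
    if use_v1:
        # pop() removes "use_irope" from extra_impl_args...
        use_irope = extra_impl_args.pop("use_irope", False)
    # ...so this later unconditional lookup misses the key, falls back to
    # False, and silently overwrites the value popped above.
    use_irope = extra_impl_args.get("use_irope", False)
    return use_irope

def use_irope_after_fix(use_v1: bool, extra_impl_args: dict) -> bool:
    # Assign the flag exactly once per branch; the later override is removed.
    if use_v1:
        use_irope = extra_impl_args.pop("use_irope", False)
    else:
        use_irope = extra_impl_args.get("use_irope", False)
    return use_irope

assert use_irope_before_fix(True, {"use_irope": True}) is False  # bug: always disabled
assert use_irope_after_fix(True, {"use_irope": True}) is True    # fix: flag respected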
