@@ -72,9 +72,6 @@ def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
             vllm_config.parallel_config)
         self.headdim = model_config.get_head_size()
 
-        self.attention_chunk_size = getattr(vllm_config.scheduler_config,
-                                            'attention_chunk_size', None)
-
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ) -> TritonAttentionMetadata:
@@ -208,7 +205,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -228,8 +224,6 @@ def __init__(
         self.logits_soft_cap = logits_soft_cap
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
-        self.use_irope = use_irope
-
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         TritonAttentionBackend.validate_head_size(head_size)
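
The retained line `self.num_queries_per_kv = self.num_heads // self.num_kv_heads` encodes the grouped-query attention (GQA) head mapping used by the backend. Below is a minimal illustrative sketch of that grouping, assuming the common convention that query head `i` shares KV head `i // num_queries_per_kv`; the helper name and shapes are hypothetical and not part of this diff:

```python
import torch

def expand_kv_heads(kv: torch.Tensor, num_queries_per_kv: int) -> torch.Tensor:
    """Illustrative GQA helper (not vLLM code): repeat each KV head so that
    every group of `num_queries_per_kv` query heads sees its shared KV head.

    kv: [num_tokens, num_kv_heads, head_size]
    returns: [num_tokens, num_kv_heads * num_queries_per_kv, head_size]
    """
    return torch.repeat_interleave(kv, num_queries_per_kv, dim=1)

# Example: 32 query heads sharing 8 KV heads gives 4 queries per KV head,
# matching num_queries_per_kv = num_heads // num_kv_heads above.
num_heads, num_kv_heads, head_size = 32, 8, 128
num_queries_per_kv = num_heads // num_kv_heads  # == 4
k = torch.randn(16, num_kv_heads, head_size)
assert expand_kv_heads(k, num_queries_per_kv).shape == (16, num_heads, head_size)
```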