8 files changed: +52 −0 lines changed
File 1 of 8 (FlashInfer backend):

 from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)

+logger = init_logger(__name__)
+
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)

@@ -907,7 +910,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
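The same pattern repeats in every backend below: the constructor gains a use_irope flag, and because interleaved-RoPE local attention is not implemented in that backend, it logs a one-time warning and keeps its existing global-attention behavior. A minimal, self-contained sketch of that warn-once-and-fall-back pattern (ToyAttentionImpl and the warning_once helper here are illustrative stand-ins, not vLLM's actual classes):

import logging

logger = logging.getLogger(__name__)
_seen_warnings: set[str] = set()


def warning_once(msg: str) -> None:
    # Log each distinct message only once per process; a stand-in for the
    # logger.warning_once helper the diff relies on.
    if msg not in _seen_warnings:
        _seen_warnings.add(msg)
        logger.warning(msg)


class ToyAttentionImpl:
    """Hypothetical backend that accepts use_irope but cannot honor it."""

    def __init__(self, num_heads: int, head_size: int,
                 use_irope: bool = False) -> None:
        if use_irope:
            # No local-attention kernel here, so warn and keep the previous
            # behavior: every token attends globally.
            warning_once(
                "Using irope in ToyBackend is not supported yet, it will "
                "fall back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size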
File 2 of 8 (HPU backend):

@@ -108,8 +108,13 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if use_irope:
+            logger.warning_once(
+                "Using irope in HPU is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.kv_cache_dtype = kv_cache_dtype
         self.num_heads = num_heads
         self.head_size = head_size
File 3 of 8 (IPEX backend):

 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)

 _PARTITION_SIZE = 512

@@ -119,7 +122,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Ipex is not supported yet, it will fall"
+                " back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "IPEX backend does not support block-sparse attention.")
File 4 of 8 (Pallas backend):

                                               AttentionMetadata, AttentionType,
                                               is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class PallasAttentionBackend(AttentionBackend):

@@ -105,7 +108,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
File 5 of 8 (ROCm Flash Attention backend):

@@ -462,7 +462,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in ROCm Flash Attention is not supported yet, it "
+                "will fall back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
File 6 of 8 (Torch SDPA backend):

@@ -404,13 +404,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("Torch SPDA does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Torch SPDA is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
File 7 of 8 (XFormers backend):

@@ -389,13 +389,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("XFormers does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in XFormers is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
File 8 of 8 (Pallas backend, V1):

 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class PallasAttentionBackend(AttentionBackend):

@@ -80,7 +83,12 @@ def __init__(
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError("Paged attention Pallas kernel does "
                              "not support block-sparse attention.")
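For callers nothing changes functionally yet: passing use_irope=True is accepted by every backend but only produces a single log line. A usage sketch against the toy class above (expected output shown as comments, assuming Python's default logging configuration):

# The first construction warns; the second does not, because warning_once
# deduplicates by message.
impl_a = ToyAttentionImpl(num_heads=32, head_size=128, use_irope=True)
# WARNING: Using irope in ToyBackend is not supported yet, it will fall
# back to global attention for long context.
impl_b = ToyAttentionImpl(num_heads=32, head_size=128, use_irope=True)
# (no second warning)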