|
1 | 1 | # Copyright (C) 2018-2025 Intel Corporation |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | | - |
| 3 | +from genai_opt import SparseAttention |
| 4 | +from genai_opt import KVCacheCompressionMode, KVCacheCompressionParameters, KVCacheCompressor |
4 | 5 |
|
5 | 6 | def add_visual_pruning_args(parser): |
6 | 7 | group = parser.add_argument_group("Visual Token Pruning Arguments") |
@@ -28,3 +29,53 @@ def add_attention_args(parser): |
28 | 29 | help="Window size of recent tokens each query can attend to in the Tri-shape pattern" |
29 | 30 | ) |
30 | 31 | return parser |
| 32 | + |
| 33 | + |
def add_token_eviction_args(parser):
    """Register the KV cache token eviction command-line options on ``parser``.

    All options are placed in a dedicated "Token Eviction Arguments" group.

    Args:
        parser: An ``argparse``-style parser exposing ``add_argument_group``.

    Returns:
        The same ``parser`` object that was passed in.
    """
    eviction_group = parser.add_argument_group("Token Eviction Arguments")
    add_option = eviction_group.add_argument

    # Feature switch and categorical settings.
    add_option("--enable_eviction", action="store_true", help="Enable token eviction")
    add_option(
        "--algorithm",
        default="snapkv",
        choices=["snapkv", "h2o"],
        help="The KV cache eviction algorithm",
    )
    add_option(
        "--granularity",
        default="per_group",
        choices=["per_token", "per_group"],
        help="Eviction granularity",
    )
    add_option(
        "--normalize_scores",
        action="store_true",
        help="Whether to normalize the attention scores by the number of times each token was attended to.",
    )

    # Integer-valued settings, registered in the order they appear in --help.
    integer_options = (
        ("--start_tokens", 32, "The number of tokens in the beginning of the cache (least recent) to be retained"),
        ("--intermediate_tokens", 1024, "The number of intermediate tokens to consider for eviction"),
        ("--recent_tokens", 128, "The number of most recent tokens to be retained"),
        ("--group_size", 32, "Group size for per-group eviction strategy"),
        ("--window_size", None, "The size of the importance score aggregation window"),
    )
    for flag, default_value, help_text in integer_options:
        add_option(flag, type=int, default=default_value, help=help_text)

    return parser
| 55 | + |
| 56 | + |
def get_sparse_attention_patcher(args):
    """Build a ``SparseAttention`` patcher from parsed command-line arguments.

    Args:
        args: Parsed arguments namespace. Reads ``prefill_impl``, ``threshold``,
            ``recent_size`` and ``last_query_size``. ``enable_eviction`` is
            optional and treated as ``False`` when absent.

    Returns:
        The ``SparseAttention`` instance constructed from those settings.
    """
    print(f"Enable custom attention kernel with {args.prefill_impl} implementation")

    # ``enable_eviction`` is registered by add_token_eviction_args, which a caller
    # may not have applied to its parser; fall back to False instead of raising
    # AttributeError. Attention weights are output only if eviction is enabled.
    output_attentions = getattr(args, "enable_eviction", False)

    return SparseAttention(
        algorithm=args.prefill_impl,
        threshold=args.threshold,
        recent_size=args.recent_size,
        last_query_size=args.last_query_size,
        output_attentions=output_attentions,
    )
| 66 | + |
| 67 | + |
def get_eviction_patcher(args):
    """Build a ``KVCacheCompressor`` from parsed command-line arguments.

    Args:
        args: Parsed arguments namespace. Reads ``algorithm``, ``granularity``,
            ``group_size``, ``start_tokens``, ``recent_tokens``,
            ``intermediate_tokens``, ``normalize_scores`` and ``window_size``.

    Returns:
        The ``KVCacheCompressor`` created with the assembled
        ``KVCacheCompressionParameters``.
    """
    print(f"Enable token eviction with {args.algorithm} algorithm")

    # The algorithm name is converted to the compression-mode type first; every
    # other setting is forwarded from ``args`` unchanged under the same name.
    compression_mode = KVCacheCompressionMode(args.algorithm)
    forwarded_fields = (
        "granularity",
        "group_size",
        "start_tokens",
        "recent_tokens",
        "intermediate_tokens",
        "normalize_scores",
        "window_size",
    )
    forwarded_settings = {field: getattr(args, field) for field in forwarded_fields}

    compression_params = KVCacheCompressionParameters(algorithm=compression_mode, **forwarded_settings)
    return KVCacheCompressor(eviction_parameters=compression_params)
0 commit comments