4
4
5
5
#pragma once
6
6
7
- #include < cmath>
8
- #include < cstddef>
9
- #include < memory>
10
- #include < queue>
11
-
12
7
#include " openvino/op/util/attr_types.hpp"
13
8
#include " openvino/reference/matmul.hpp"
14
9
#include " openvino/reference/normalize_l2.hpp"
15
10
#include " openvino/reference/reduce_mean.hpp"
16
11
#include " openvino/reference/slice.hpp"
17
- #include " openvino/runtime/tensor.hpp"
18
12
19
13
namespace ov ::reference {
20
14
@@ -27,7 +21,7 @@ class AdaptiveRKVDiversityCalculator {
27
21
* calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of
28
22
* `block_size`.
29
23
* @param eviction_size Size, in tokens, of the area immediately following the start area, the tokens in which will be
30
- * considred for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction
24
+ * considered for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction
31
25
* area, if any, are ignored. Must be a multiple of `block_size`.
32
26
* @param block_size Block size of the underlying paged attention implementation. The diversity values will be
33
27
* sum-reduced from per-token values to per-block values based on this number of tokens in a block.
@@ -92,6 +86,8 @@ class AdaptiveRKVDiversityCalculator {
92
86
}
93
87
94
88
/** For a square matrix, sums each `block_size`-sized group of matrix rows to produce a row in the output matrix.
89
+ * In the overall algorithm context, each summed value represents diversity (the negative of inter-token cosine
90
+ * similarity), where larger absolute values indicate greater diversity.
95
91
* @param in_data Pointer to the matrix data.
96
92
* @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a
97
93
* multiple of `block_size`.
@@ -130,7 +126,7 @@ class AdaptiveRKVDiversityCalculator {
130
126
* block in the eviction area. Due to implementation specifics the paged attention kernel does not know ahead of
131
127
* time which blocks will be "retained" - this information is only available on the openvino.genai level after the
132
128
* PA kernel has executed. Therefore the PA kernel will provide raw per-token values on the rank 1 of the returned
133
- * diversity value matrix and delegatei the final reduce-mean and filtering to the openvino.genai level.
129
+ * diversity value matrix and delegate the final reduce-mean and filtering to the openvino.genai level.
134
130
* @param key_data Pointer to the key cache tensor data
135
131
* @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size],
136
132
* where `num_key_tokens` must be no less than `start_size + eviction_size`.
@@ -165,15 +161,14 @@ class AdaptiveRKVDiversityCalculator {
165
161
166
162
Shape evictable_subset_shape = {key_shape[0 ], m_eviction_size, m_eviction_size};
167
163
auto evictable_subset_buf = allocate_buf (evictable_subset_shape);
168
- // stops?
169
164
ov::reference::slice (reinterpret_cast <char *>(cos_similar_buf.get ()),
170
165
cos_similar_shape,
171
166
reinterpret_cast <char *>(evictable_subset_buf.get ()),
172
167
evictable_subset_shape,
173
168
sizeof (T),
174
169
/* starts = */ {m_start_size, m_start_size},
175
170
/* steps = */ {1 , 1 },
176
- /* axes = */ {1 , 2 });
171
+ /* axes = */ {1 , 2 }); // stops are defined by output shape
177
172
cos_similar_buf.reset ();
178
173
179
174
fill_diagonal_ (evictable_subset_buf.get (), evictable_subset_shape, 0.0 );
0 commit comments