namespace ov::reference {

/** @brief Reference implementation of the Adaptive R-KV token diversity calculation mechanism
 * (https://arxiv.org/pdf/2505.24133v3) */
template <typename T>
class AdaptiveRKVDiversityCalculator {
public:
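    // Assumed layout along the token dimension of the key cache (illustrative):
    //
    //   [ start area         | eviction area          | remaining tokens       ]
    //     start_size tokens    eviction_size tokens     ignored, may be absent
    //
    // Diversity values are computed for the eviction area only and are reported per block of `block_size` tokens.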
    /** @param start_size Size, in tokens, of the key cache area that will be ignored for purposes of diversity
     * calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of
     * `block_size`.
     * @param eviction_size Size, in tokens, of the area immediately following the start area ("eviction area"), the
     * tokens in which will be considered for purposes of diversity calculation. The rest of the tokens after the
     * eviction area, if any, are ignored. Must be a multiple of `block_size`.
     * @param block_size Block size of the underlying paged attention implementation. The diversity values will be
     * sum-reduced from per-token values to per-block values based on this number of tokens in a block.
     */
    AdaptiveRKVDiversityCalculator(size_t start_size, size_t eviction_size, size_t block_size)
        : m_start_size(start_size),
@@ -46,13 +46,10 @@ class AdaptiveRKVDiversityCalculator {
     * @param in_out_shape Shape of the matrix data. Expected shape is [num_heads, token_dim, token_dim].
     * @param val Value to fill in the diagonal positions.
     */
    void fill_diagonal_(T* in_out, const Shape& in_out_shape, T val) {
        OPENVINO_ASSERT(in_out_shape.size() == 3);  // [num_heads, token_dim, token_dim]
        OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]);  // [num_heads, token_dim, token_dim]

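        // Walk each head's square [token_dim, token_dim] slice, writing `val` into its main-diagonal entries.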
        for (size_t head_idx = 0; head_idx < in_out_shape[0]; head_idx++) {
            size_t in_head_offset = head_idx * in_out_shape[1] * in_out_shape[2];
            for (size_t token_dim_idx = 0; token_dim_idx < in_out_shape[1]; token_dim_idx++) {
@@ -63,19 +60,19 @@ class AdaptiveRKVDiversityCalculator {
        }
    }

    /** For a rank-3 tensor, zeroes out the values that are less than the mean of the values of the corresponding slice
     * at rank 2 (zero-based). Ranks 1 and 2 of the input tensor must be equal. Mean values are computed and provided
     * externally. The operation is done in-place.
     * @param in_out Pointer to the tensor data.
     * @param in_out_shape Shape of the tensor data. Expected shape is [num_heads, token_dim, token_dim].
     * @param means Pointer to the tensor data containing the means of each slice of the `in_out` tensor along its rank
     * 2 (zero-based).
     * @param means_shape Shape of the means tensor. Expected shape is [num_heads, token_dim].
     */
    void fill_low_values_with_zeros_(T* in_out, const Shape& in_out_shape, const T* means, const Shape& means_shape) {
        OPENVINO_ASSERT(in_out_shape.size() == 3);  // [num_heads, token_dim, token_dim]
        OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]);
        OPENVINO_ASSERT(means_shape.size() == 2);  // [num_heads, token_dim]
        OPENVINO_ASSERT(means_shape[0] == in_out_shape[0]);
        OPENVINO_ASSERT(means_shape[1] == in_out_shape[1]);
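        // Per the description above: an entry in_out[h][i][j] is set to zero when it is less than means[h][i].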

@@ -96,14 +93,12 @@ class AdaptiveRKVDiversityCalculator {

    /** For a square matrix, sums each `block_size`-sized group of matrix rows to produce a row in the output matrix.
     * @param in_data Pointer to the matrix data.
     * @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a
     * multiple of `block_size`.
     * @param out Pointer to the output matrix data.
     * @param out_shape Shape of the output matrix. Expected shape is [token_dim / block_size, token_dim].
     */
    void block_sum_diversity_values(const T* in_data, const Shape& in_shape, T* out, const Shape& out_shape) {
        OPENVINO_ASSERT(in_shape.size() == 2);  // [token_dim, token_dim]
        OPENVINO_ASSERT(in_shape[0] == in_shape[1]);
        OPENVINO_ASSERT(in_shape[0] % m_block_size == 0);
@@ -117,11 +112,11 @@ class AdaptiveRKVDiversityCalculator {
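        // Note the `-=` below: each output entry accumulates the negated sum of the corresponding `block_size` input
        // rows, so larger summed similarities translate into lower (more negative) per-block diversity values.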
        for (size_t out_block_dim_idx = 0; out_block_dim_idx < out_shape[0]; out_block_dim_idx++) {
            size_t out_block_offset = out_block_dim_idx * out_shape[1];
            for (size_t out_token_dim_idx = 0; out_token_dim_idx < out_shape[1]; out_token_dim_idx++) {
                size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1];
                for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) {
                    size_t source_offset = in_block_offset + in_token_in_block_idx * in_shape[1] + out_token_dim_idx;
                    out[out_block_offset + out_token_dim_idx] -= in_data[source_offset];
                }
            }
        }
    }
@@ -131,37 +126,54 @@ class AdaptiveRKVDiversityCalculator {
     * that rank 1 is left unaggregated when compared to the full diversity calculation algorithm. The reason
     * for this is as follows. The final per-block diversity value computation relies on knowing the subset of blocks
     * in the eviction area that will be retained regardless of calculated diversity. This subset must be filtered out
     * from the rank-1 dimension when performing reduce-mean in the original algorithm to get 1 diversity value per
     * block in the eviction area. Due to implementation specifics the paged attention kernel does not know ahead of
     * time which blocks will be "retained" - this information is only available on the openvino.genai level after the
     * PA kernel has executed. Therefore the PA kernel will provide raw per-token values on rank 1 of the returned
     * diversity value matrix and delegate the final reduce-mean and filtering to the openvino.genai level.
     * @param key_data Pointer to the key cache tensor data
     * @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size],
     * where `num_key_tokens` must be no less than `start_size + eviction_size`.
     * @return A rank-2 matrix in the std::vector representation with dimensions [eviction_size / block_size,
     * eviction_size] containing the diversity values. The values are expected to be further mean-reduced along rank 1
     * (zero-based) at the point in time when the subset of blocks to be exclusively retained is known.
     */
    std::vector<std::vector<T>> calculate_block_diversity(const T* key_data, const Shape& key_shape) {
        OPENVINO_ASSERT(key_shape.size() == 3);  // [num_heads, key_token_len, head_dim]
        OPENVINO_ASSERT(key_shape[1] >= m_start_size + m_eviction_size);

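        // Pairwise cosine similarity between key tokens, per head: L2-normalize each key vector along the head
        // dimension, then multiply the normalized keys by their own transpose to get a
        // [num_heads, num_key_tokens, num_key_tokens] similarity matrix.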
        auto normalized_key_data_buf = allocate_buf(key_shape);
        // Should be safe to use this in-place
        ov::reference::normalize_l2(key_data,
                                    normalized_key_data_buf.get(),
                                    key_shape,
                                    {2},
                                    std::numeric_limits<float>::epsilon(),
                                    ov::op::EpsMode::ADD);

        Shape cos_similar_shape = {key_shape[0], key_shape[1], key_shape[1]};
        auto cos_similar_buf = allocate_buf(cos_similar_shape);
        ov::reference::matmul(normalized_key_data_buf.get(),
                              normalized_key_data_buf.get(),
                              cos_similar_buf.get(),
                              key_shape,
                              key_shape,
                              cos_similar_shape,
                              /* transpose_arg0 = */ false,
                              /* transpose_arg1 = */ true);
        normalized_key_data_buf.reset();

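        // Restrict the similarity matrix to the eviction area, i.e. tokens [start_size, start_size + eviction_size)
        // along both token dimensions.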
        Shape evictable_subset_shape = {key_shape[0], m_eviction_size, m_eviction_size};
        auto evictable_subset_buf = allocate_buf(evictable_subset_shape);
        // stops?
        ov::reference::slice(reinterpret_cast<char*>(cos_similar_buf.get()),
                             cos_similar_shape,
                             reinterpret_cast<char*>(evictable_subset_buf.get()),
                             evictable_subset_shape,
                             sizeof(T),
                             /* starts = */ {m_start_size, m_start_size},
                             /* steps = */ {1, 1},
                             /* axes = */ {1, 2});
        cos_similar_buf.reset();

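        // After L2 normalization every token's similarity with itself is ~1; zero the diagonal so self-similarity
        // does not contribute to the per-token diversity statistics.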
        fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0);
@@ -175,12 +187,18 @@ class AdaptiveRKVDiversityCalculator {

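        // Average over the head dimension (axis 0): [num_heads, eviction_size, eviction_size] ->
        // [eviction_size, eviction_size].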
        Shape aggregated_token_similarities_shape = {m_eviction_size, m_eviction_size};
        auto aggregated_token_similarities_buf = allocate_buf(aggregated_token_similarities_shape);
        ov::reference::reduce_mean(evictable_subset_buf.get(),
                                   aggregated_token_similarities_buf.get(),
                                   evictable_subset_shape,
                                   {0});
        evictable_subset_buf.reset();

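        // Sum-reduce each group of `block_size` rows into one (negated) row per block of the eviction area.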
        Shape block_diversity_shape = {m_eviction_size / m_block_size, m_eviction_size};
        auto block_diversity_buf = allocate_buf(block_diversity_shape);
        block_sum_diversity_values(aggregated_token_similarities_buf.get(),
                                   aggregated_token_similarities_shape,
                                   block_diversity_buf.get(),
                                   block_diversity_shape);
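        // Repack the flat [eviction_size / block_size, eviction_size] buffer into the nested std::vector
        // representation returned to the caller.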
        std::vector<std::vector<T>> retval(block_diversity_shape[0], std::vector<T>(block_diversity_shape[1]));
        for (size_t block_idx = 0; block_idx < block_diversity_shape[0]; block_idx++) {
            for (size_t token_idx = 0; token_idx < block_diversity_shape[1]; token_idx++) {
@@ -199,7 +217,6 @@ class AdaptiveRKVDiversityCalculator {
        return std::shared_ptr<T[]>(new T[ov::shape_size(shape)]);
    }

    size_t m_start_size;
    size_t m_eviction_size;
    size_t m_block_size;
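
    // Minimal usage sketch (illustrative only; the sizes below are hypothetical and `key_cache` is assumed to have
    // been filled by the paged attention kernel elsewhere):
    //
    //   ov::Shape key_shape{num_heads, num_key_tokens, head_size};  // num_key_tokens >= start_size + eviction_size
    //   ov::reference::AdaptiveRKVDiversityCalculator<float> calc(/* start_size = */ 32,
    //                                                             /* eviction_size = */ 128,
    //                                                             /* block_size = */ 32);
    //   auto block_diversity = calc.calculate_block_diversity(key_cache.data(), key_shape);
    //   // block_diversity is [eviction_size / block_size][eviction_size]; the openvino.genai side is expected to
    //   // mean-reduce rank 1 after filtering out the blocks it retains unconditionally.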