From b539b481f23d39fc857551763b4779eebeaf368f Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 6 Oct 2025 14:52:39 +0200 Subject: [PATCH 1/8] Add Adaptive R-KV reference op implementation --- .../reference/adaptive_rkv_diversity.hpp | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp new file mode 100644 index 00000000000000..e8065409da203f --- /dev/null +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -0,0 +1,204 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "openvino/reference/matmul.hpp" +#include "openvino/reference/normalize_l2.hpp" +#include "openvino/reference/reduce_mean.hpp" +#include "openvino/reference/slice.hpp" +#include "openvino/runtime/tensor.hpp" + +namespace ov::reference { + + +/** @brief Reference implementation of the XAttention sparse attention prefill mechanism + * (https://arxiv.org/abs/2503.16428) */ +template +class AdaptiveRKVDiversityCalculator { +public: + /** @param threshold Defines a threshold for introduced block sparsity - XAttention attempts to preserve the + * smallest subset of attention score matrix blocks so that the ratio of the attention score sum to the total sum of + * attention score matrix elements is no less than `threshold`. In other words, `threshold` defines a fraction of + * the attention score mass which is to be preserved by most "important" blocks. Valid range is 0.0-1.0, with 0.0 + * corresponding to 0% of the blocks retained, and 1.0 corresponding to 100% of the blocks retained. 
+ * @param block_size The size of blocks into which the attention score matrix [num_heads, query_token_dimension, + * key_token_dimension] will be subdivided for purposes of determining the subset of the most important blocks + * according to `threshold`. This subdivision occurs on query and key dimensions of the attention score matrix with + * the same granularity, i.e. the resulting blocks have equal size on both dimensions. Essentially `block_size` + * defines the granularity of the eventual sparse attention computations. Must be a multiple of `stride`. + * @param stride The stride at which the full attention matrix is subsampled in a block-antidiagonal fashion to + * estimate the block importance. Note that the full attention matrix is not computed, instead the original query + * and key matrices are reshaped appropriately so that only the necessary elements are computed. Ideally, the + * computational complexity of the entire block estimation operation is `stride` times lower than the full attention + * matrix computation. + * */ + AdaptiveRKVDiversityCalculator(size_t start_size, size_t eviction_size, size_t block_size) + : m_start_size(start_size), + m_eviction_size(eviction_size), + m_block_size(block_size) { + OPENVINO_ASSERT(start_size % block_size == 0); + OPENVINO_ASSERT(eviction_size % block_size == 0); + } + + /** Divides the input rank-3 tensor into blocks along last two dimensions, performs the addition of the values + * inside each block and outputs each block sum into corresponding positions in the output tensor downsampled along + * the same dimensions. The output tensor dimensions are such that the query and key token dimensions are + * downsampled by `block_size` when compared to the *original* query and key tensors. + * @param attention_scores_data Pointer to the attention score input. + * @param attention_score_shape Shape of the attention score input tensor. 
Expected shape is [num_heads, + * num_query_tokens / stride, num_key_tokens / stride], where `num_query_tokens` and `num_key_tokens` must be + * multiples of `block_size`. + * @param out Pointer to the output tensor data (block sums) + * @param out_shape Shape of the output tensor data. Expected shape is [num_heads, num_query_tokens / block_size, + * num_key_tokens / block_size]. + */ + void fill_diagonal_(const T* in_out, + const Shape& in_out_shape, + T val) { + OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] + OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]); // [num_heads, token_dim, token_dim] + + + for (size_t head_idx = 0; head_idx < in_out_shape[0]; head_idx++) { + size_t in_head_offset = head_idx * in_out_shape[1] * in_out_shape[2]; + for (size_t token_dim_idx = 0; token_dim_idx < in_out_shape[1]; token_dim_idx++) { + size_t diagonal_element_offset = token_dim_idx + token_dim_idx * in_out_shape[1]; + auto diagonal_element_ptr = in_out + in_head_offset + diagonal_element_offset; + *diagonal_element_ptr = val; + } + } + } + + void fill_low_values_with_zeros_(const T* in_out, + const Shape& in_out_shape, + const T* means, + const Shape& means_shape) { + OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] + OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]); + OPENVINO_ASSERT(means_shape.size() == 2); // [num_heads, token_dim] + OPENVINO_ASSERT(means_shape[0] == in_out_shape[0]); + OPENVINO_ASSERT(means_shape[1] == in_out_shape[1]); + + for (size_t head_idx = 0; head_idx < in_out_shape[0]; head_idx++) { + size_t in_head_offset = head_idx * in_out_shape[1] * in_out_shape[2]; + size_t means_head_offset = head_idx * means_shape[1]; + for (size_t token_dim_idx = 0; token_dim_idx < in_out_shape[1]; token_dim_idx++) { + T mean_val = means[means_head_offset + token_dim_idx]; + size_t token_offset = token_dim_idx * in_out_shape[2]; + for (size_t reduced_dim_idx = 0; reduced_dim_idx < in_out_shape[2]; 
reduced_dim_idx++) { + size_t target_offset = in_head_offset + token_offset + reduced_dim_idx; + T filled_val = in_out[target_offset]; + in_out[target_offset] = filled_val >= mean_val ? filled_val : 0.0; + } + } + } + } + + void block_sum_diversity_values(const T* processed_similarity_token_data, + const Shape& processed_similarity_token_data_shape, + T* out, + const Shape& out_shape) { + OPENVINO_ASSERT(processed_similarity_token_data_shape.size() == 2); // [token_dim, token_dim] + OPENVINO_ASSERT(processed_similarity_token_data_shape[0] == processed_similarity_token_data_shape[1]); + OPENVINO_ASSERT(processed_similarity_token_data_shape[0] % m_block_size == 0); + + OPENVINO_ASSERT(out_shape.size() == 2); // [block_dim, token_dim] + OPENVINO_ASSERT(out_shape[0] == processed_similarity_token_data_shape[0] / m_block_size); + OPENVINO_ASSERT(out_shape[1] == processed_similarity_token_data_shape[1]); + + std::memset(out, 0, out_shape[0] * out_shape[1] * sizeof(T)); + + for (size_t out_block_dim_idx = 0; out_block_dim_idx < out_shape[0]; out_block_dim_idx++) { + size_t out_block_offset = out_block_dim_idx * out_shape[1]; + for (size_t out_token_dim_idx = 0; out_token_dim_idx < out_shape[1]; out_token_dim_idx++) { + size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1]; + for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) { + size_t source_offset = in_block_offset + in_token_in_block_idx * processed_similarity_token_data_shape[1] + out_token_dim_idx; + out[out_block_offset + out_token_dim_idx] += processed_similarity_token_data[source_offset]; + } + } + } + } + + /** Applies XAttention to the provided query and key matrices, returning the subset of the most important blocks for + * each attention head, according to the configured block size and threshold, which are to be preserved in the + * subsequent sparse attention computation. 
+ * @param query_data Pointer to the query input tensor data + * @param query_shape Shape of the query input tensor data. Expected shape is [num_heads, num_query_tokens, + * head_size], where `num_query_tokens` must be a multiple of both `block_size` and `stride`, padded with zeroes if + * necessary to do so in the real-world scenario. + * @param key_data Pointer to the key input tensor data + * @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size], + * where `num_key_tokens` must be a multiple of both `block_size` and `stride`, padded with zeroes if necessary to + * do so in the real-world scenario. + * @return A vector of size `num_heads` of sets, each set containing pairs of block indices (.first is the block + * index along the query dimension, .second - along the key). Each set is the head-specific subset of blocks that + * must be preserved in the sparse attention computation. Indices are given in units of XAttention-specific + * `block_size` (as configured), which may differ from the block size in the paged attention implementation. + */ + std::vector> calculate_block_diversity(const T* key_data, + const Shape& key_shape) { + OPENVINO_ASSERT(key_shape.size() == 3); // [num_heads, key_token_len, head_dim] + OPENVINO_ASSERT(key_shape[1] >= m_block_size * (m_start_size + m_eviction_size)); + + // Should be safe to use this in-place + ov::reference::normalize_l2(key_data, key_data, key_shape, {2}, std::numeric_limits::epsilon()); + + Shape cos_similar_shape = {key_shape[0], key_shape[1], key_shape[1]}; + auto cos_similar_buf = allocate_buf(cos_similar_shape); + ov::reference::matmul(key_data, key_data, cos_similar_buf.get(), key_shape, key_shape, cos_similar_shape, /* transpose_arg0 = */ false, /* transpose_arg1 = */ true); + + Shape evictable_subset_shape = {key_shape[0], m_eviction_size, m_eviction_size}; + auto evictable_subset_buf = allocate_buf(evictable_subset_shape); + // stops? 
+ ov::reference::slice(cos_similar_buf.get(), cos_similar_shape, evictable_subset_buf.get(), evictable_subset_shape, sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, /* axes = */{1, 2}); + cos_similar_buf.reset(); + + fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0); + + Shape means_shape = {key_shape[0], m_eviction_size}; + auto means_buf = allocate_buf(means_shape); + ov::reference::reduce_mean(evictable_subset_buf.get(), means_buf.get(), evictable_subset_shape, {2}); + + fill_low_values_with_zeros_(evictable_subset_buf.get(), evictable_subset_shape, means_buf.get(), means_shape); + + Shape aggregated_token_similarities_shape = {m_eviction_size, m_eviction_size}; + auto aggregated_token_similarities_buf = allocate_buf(aggregated_token_similarities_shape); + ov::reference::reduce_mean(evictable_subset_buf.get(), aggregated_token_similarities_buf.get(), evictable_subset_shape, {0}); + evictable_subset_buf.reset(); + + Shape block_diversity_shape = {m_eviction_size / m_block_size, m_eviction_size}; + auto block_diversity_buf = allocate_buf(block_diversity_shape); + block_sum_diversity_values(aggregated_token_similarities_buf.get(), aggregated_token_similarities_shape, block_diversity_buf.get(), block_diversity_shape); + std::vector> retval(block_diversity_shape[0], std::vector(block_diversity_shape[1])); + for (size_t block_idx = 0; block_idx < block_diversity_shape[0]; block_idx++) { + for (size_t token_idx = 0; token_idx < block_diversity_shape[1]; token_idx++) { + retval[block_idx][token_idx] = block_diversity_buf.get() + block_idx * block_diversity_shape[1] + token_idx; + } + } + + return retval; + } + + /** + * @param shape Shape of a tensor + * @return A shared_ptr owning a buffer that can be used to store tensor data for the given shape. 
+ * */ + std::shared_ptr allocate_buf(const Shape& shape) { + return std::shared_ptr(new T[ov::shape_size(shape)]); + } + + + size_t m_start_size; + size_t m_eviction_size; + size_t m_block_size; +}; + +} // namespace ov::reference From 5a71b4e93daa2db5129a66750008462eed3ca7f1 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Tue, 7 Oct 2025 13:32:03 +0200 Subject: [PATCH 2/8] Add basic tests --- .../reference/adaptive_rkv_diversity.hpp | 21 +- .../reference/adaptive_rkv_diversity.cpp | 443 ++++++++++++++++++ 2 files changed, 456 insertions(+), 8 deletions(-) create mode 100644 src/core/tests/reference/adaptive_rkv_diversity.cpp diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index e8065409da203f..b3f4f755266279 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -9,6 +9,7 @@ #include #include +#include "openvino/op/util/attr_types.hpp" #include "openvino/reference/matmul.hpp" #include "openvino/reference/normalize_l2.hpp" #include "openvino/reference/reduce_mean.hpp" @@ -59,7 +60,7 @@ class AdaptiveRKVDiversityCalculator { * @param out_shape Shape of the output tensor data. Expected shape is [num_heads, num_query_tokens / block_size, * num_key_tokens / block_size]. 
*/ - void fill_diagonal_(const T* in_out, + void fill_diagonal_(T* in_out, const Shape& in_out_shape, T val) { OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] @@ -76,7 +77,7 @@ class AdaptiveRKVDiversityCalculator { } } - void fill_low_values_with_zeros_(const T* in_out, + void fill_low_values_with_zeros_(T* in_out, const Shape& in_out_shape, const T* means, const Shape& means_shape) { @@ -121,7 +122,7 @@ class AdaptiveRKVDiversityCalculator { size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1]; for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) { size_t source_offset = in_block_offset + in_token_in_block_idx * processed_similarity_token_data_shape[1] + out_token_dim_idx; - out[out_block_offset + out_token_dim_idx] += processed_similarity_token_data[source_offset]; + out[out_block_offset + out_token_dim_idx] -= processed_similarity_token_data[source_offset]; } } } @@ -146,19 +147,22 @@ class AdaptiveRKVDiversityCalculator { std::vector> calculate_block_diversity(const T* key_data, const Shape& key_shape) { OPENVINO_ASSERT(key_shape.size() == 3); // [num_heads, key_token_len, head_dim] - OPENVINO_ASSERT(key_shape[1] >= m_block_size * (m_start_size + m_eviction_size)); + OPENVINO_ASSERT(key_shape[1] >= m_start_size + m_eviction_size); + + auto normalized_key_data_buf = allocate_buf(key_shape); // Should be safe to use this in-place - ov::reference::normalize_l2(key_data, key_data, key_shape, {2}, std::numeric_limits::epsilon()); + ov::reference::normalize_l2(key_data, normalized_key_data_buf.get(), key_shape, {2}, std::numeric_limits::epsilon(), ov::op::EpsMode::ADD); Shape cos_similar_shape = {key_shape[0], key_shape[1], key_shape[1]}; auto cos_similar_buf = allocate_buf(cos_similar_shape); - ov::reference::matmul(key_data, key_data, cos_similar_buf.get(), key_shape, key_shape, cos_similar_shape, /* transpose_arg0 = */ false, /* transpose_arg1 = */ true); + 
ov::reference::matmul(normalized_key_data_buf.get(), normalized_key_data_buf.get(), cos_similar_buf.get(), key_shape, key_shape, cos_similar_shape, /* transpose_arg0 = */ false, /* transpose_arg1 = */ true); + normalized_key_data_buf.reset(); Shape evictable_subset_shape = {key_shape[0], m_eviction_size, m_eviction_size}; auto evictable_subset_buf = allocate_buf(evictable_subset_shape); // stops? - ov::reference::slice(cos_similar_buf.get(), cos_similar_shape, evictable_subset_buf.get(), evictable_subset_shape, sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, /* axes = */{1, 2}); + ov::reference::slice(reinterpret_cast(cos_similar_buf.get()), cos_similar_shape, reinterpret_cast(evictable_subset_buf.get()), evictable_subset_shape, sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, /* axes = */{1, 2}); cos_similar_buf.reset(); fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0); @@ -168,6 +172,7 @@ class AdaptiveRKVDiversityCalculator { ov::reference::reduce_mean(evictable_subset_buf.get(), means_buf.get(), evictable_subset_shape, {2}); fill_low_values_with_zeros_(evictable_subset_buf.get(), evictable_subset_shape, means_buf.get(), means_shape); + means_buf.reset(); Shape aggregated_token_similarities_shape = {m_eviction_size, m_eviction_size}; auto aggregated_token_similarities_buf = allocate_buf(aggregated_token_similarities_shape); @@ -180,7 +185,7 @@ class AdaptiveRKVDiversityCalculator { std::vector> retval(block_diversity_shape[0], std::vector(block_diversity_shape[1])); for (size_t block_idx = 0; block_idx < block_diversity_shape[0]; block_idx++) { for (size_t token_idx = 0; token_idx < block_diversity_shape[1]; token_idx++) { - retval[block_idx][token_idx] = block_diversity_buf.get() + block_idx * block_diversity_shape[1] + token_idx; + retval[block_idx][token_idx] = block_diversity_buf[block_idx * block_diversity_shape[1] + token_idx]; } } diff --git 
a/src/core/tests/reference/adaptive_rkv_diversity.cpp b/src/core/tests/reference/adaptive_rkv_diversity.cpp new file mode 100644 index 00000000000000..6595162e1aa3ec --- /dev/null +++ b/src/core/tests/reference/adaptive_rkv_diversity.cpp @@ -0,0 +1,443 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include + +namespace adaptive_rkv_test { +size_t DEFAULT_BLOCK_SIZE = 2; +size_t DEFAULT_START_SIZE = 2; +size_t DEFAULT_EVICTION_SIZE = 10; + + +TEST(AdaptiveRKVE2ESmokeTest, CalculatesDiversityWithoutThrowing) { + ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, + DEFAULT_EVICTION_SIZE, + DEFAULT_BLOCK_SIZE); + + ov::Shape mock_shape{2, (DEFAULT_START_SIZE + DEFAULT_EVICTION_SIZE) * 2, 8}; + std::vector mock_data(ov::shape_size(mock_shape), 1.0); + + EXPECT_NO_THROW(calculator.calculate_block_diversity(mock_data.data(), mock_shape)); +}; + + +struct FillDiagonalTestData { + ov::Shape in_shape; + std::vector in_data; + std::vector ref_out_data; +}; + +using AdaptiveRKVDiversityFillDiagonalTest = ::testing::TestWithParam; + +std::vector FILL_DIAGONAL_TEST_CASES = { + { + {2, 4, 4}, + // clang-format off + { + 3.144, 8.512, 8.518, -8.386, + 7.889, -5.721, 5.507, 4.295, + -6.624, -8.463, 7.474, 9.879, + 4.534, -5.908, -9.388, 2.356, + + 7.497, 8.186, -8.658, -4.796, + -8.248, -9.797, -7.907, -4.513, + 3.469, 7.633, 7.244, -6.844, + -7.173, 4.450, 6.705, -7.035 + }, + // clang-format on + + // clang-format off + { + 42.00, 8.512, 8.518, -8.386, + 7.889, 42.00, 5.507, 4.295, + -6.624, -8.463, 42.00, 9.879, + 4.534, -5.908, -9.388, 42.00, + + 42.00, 8.186, -8.658, -4.796, + -8.248, 42.00, -7.907, -4.513, + 3.469, 7.633, 42.00, -6.844, + -7.173, 4.450, 6.705, 42.00 + }, + // clang-format on + } +}; + +TEST_P(AdaptiveRKVDiversityFillDiagonalTest, FillsDiagonal) { + auto test_struct = GetParam(); + ASSERT_EQ(test_struct.in_data.size(), ov::shape_size(test_struct.in_shape)); + 
ASSERT_EQ(test_struct.ref_out_data.size(), ov::shape_size(test_struct.in_shape)); + + ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, DEFAULT_EVICTION_SIZE, DEFAULT_BLOCK_SIZE); + + std::vector test_out_data = test_struct.in_data; + calculator.fill_diagonal_(test_out_data.data(), + test_struct.in_shape, + 42.0); + EXPECT_EQ(test_out_data, test_struct.ref_out_data); +} + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + AdaptiveRKVDiversityFillDiagonalTest, + ::testing::ValuesIn(FILL_DIAGONAL_TEST_CASES)); + +struct FillLowValuesWithZerosTestData { + ov::Shape in_shape; + std::vector in_data; + ov::Shape means_shape; + std::vector means; + std::vector ref_out_data; +}; + +using AdaptiveRKVFillLowValuesWithZerosTest = ::testing::TestWithParam; + +std::vector FILL_LOW_VALUES_WITH_ZEROS_TEST_CASES = { + { + {2, 4, 4}, + // clang-format off + { + 4.534, -5.908, -9.388, 2.356, + -6.624, -8.463, 7.474, 9.879, + 7.889, -5.721, 5.507, 4.295, + 3.144, 8.512, 8.518, -8.386, + + -7.173, 4.450, 6.705, -7.035, + 3.469, 7.633, 7.244, -6.844, + -8.248, -9.797, -7.907, -4.513, + 7.497, 8.186, -8.658, -4.796, + }, + // clang-format on + + {2, 4}, + + // clang-format off + { + -2.1015, + 0.5665, + 2.9925, + 2.947, + + -0.76325, + 2.8755, + -7.61625, + 0.55725 + }, + + // clang-format off + { + 4.534, 0.000, 0.000, 2.356, + 0.000, 0.000, 7.474, 9.879, + 7.889, 0.000, 5.507, 4.295, + 3.144, 8.512, 8.518, 0.000, + + 0.000, 4.450, 6.705, 0.000, + 3.469, 7.633, 7.244, 0.000, + 0.000, 0.000, 0.000, -4.513, + 7.497, 8.186, 0.000, 0.000, + }, + // clang-format on + }, +}; + +TEST_P(AdaptiveRKVFillLowValuesWithZerosTest, FillsLowValuesWithZero) { + auto test_struct = GetParam(); + ASSERT_EQ(test_struct.in_data.size(), ov::shape_size(test_struct.in_shape)); + ASSERT_EQ(test_struct.means.size(), ov::shape_size(test_struct.means_shape)); + ASSERT_EQ(test_struct.ref_out_data.size(), ov::shape_size(test_struct.in_shape)); + + ov::reference::AdaptiveRKVDiversityCalculator 
calculator(DEFAULT_START_SIZE, + DEFAULT_EVICTION_SIZE, + DEFAULT_BLOCK_SIZE); + std::vector test_out_data = test_struct.in_data; + calculator.fill_low_values_with_zeros_(test_out_data.data(), test_struct.in_shape, test_struct.means.data(), test_struct.means_shape); + + EXPECT_THAT(test_out_data, ::testing::Pointwise(::testing::DoubleNear(1e-8), test_struct.ref_out_data)); +} + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + AdaptiveRKVFillLowValuesWithZerosTest, + ::testing::ValuesIn(FILL_LOW_VALUES_WITH_ZEROS_TEST_CASES)); + + +struct BlockSumTestData { + ov::Shape in_shape; + std::vector in_data; + size_t block_size; + ov::Shape out_shape; + std::vector ref_out_data; +}; + +using AdaptiveRKVBlockSumTest = ::testing::TestWithParam; + +std::vector BLOCK_SUM_TEST_CASES = { + { + {8, 8}, + // clang-format off + { + 0.1117, 0.0780, 0.1347, 0.0885, 0.1942, 0.0922, 0.1184, 0.1824, + 0.1488, 0.1766, 0.0852, 0.1239, 0.0930, 0.1220, 0.1367, 0.1138, + 0.1410, 0.0861, 0.0774, 0.1325, 0.1478, 0.1689, 0.0885, 0.1579, + 0.1248, 0.1038, 0.1842, 0.0935, 0.1813, 0.0890, 0.0897, 0.1336, + 0.0905, 0.1049, 0.1263, 0.0953, 0.1018, 0.1297, 0.1659, 0.1855, + 0.1373, 0.1791, 0.1005, 0.1286, 0.1492, 0.1373, 0.0820, 0.0860, + 0.0997, 0.1285, 0.0786, 0.1366, 0.1963, 0.0904, 0.1488, 0.1211, + 0.1859, 0.1174, 0.1364, 0.0930, 0.1028, 0.1034, 0.1699, 0.0912 + }, + // clang-format on + + /* block_size = */ 2, + + {4, 8}, + + // clang-format off + { + -0.2605, -0.2546, -0.2199, -0.2124, -0.2872, -0.2142, -0.2551, -0.2962, + -0.2658, -0.1899, -0.2616, -0.226, -0.3291, -0.2579, -0.1782, -0.2915, + -0.2278, -0.284 , -0.2268, -0.2239, -0.251, -0.267, -0.2479, -0.2715, + -0.2856, -0.2459, -0.215, -0.2296, -0.2991, -0.1938, -0.3187, -0.2123 + + }, + }, +}; + +TEST_P(AdaptiveRKVBlockSumTest, BlockSumIsCorrect) { + auto test_struct = GetParam(); + ASSERT_EQ(test_struct.in_data.size(), ov::shape_size(test_struct.in_shape)); + ASSERT_EQ(test_struct.ref_out_data.size(), ov::shape_size(test_struct.out_shape)); + 
+ ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, + DEFAULT_EVICTION_SIZE, + test_struct.block_size); + std::vector test_out_data(test_struct.ref_out_data.size()); + calculator.block_sum_diversity_values(test_struct.in_data.data(), test_struct.in_shape, test_out_data.data(), test_struct.out_shape); + + EXPECT_THAT(test_out_data, ::testing::Pointwise(::testing::DoubleNear(1e-5), test_struct.ref_out_data)); +} + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + AdaptiveRKVBlockSumTest, + ::testing::ValuesIn(BLOCK_SUM_TEST_CASES)); + +struct DiversityCalculateTestData { + ov::Shape in_shape; + std::vector in_data; + double threshold; + +}; + +struct E2EDiversityTestData { + ov::Shape k_shape; + std::vector k_data; + size_t start_size; + size_t eviction_size; + std::vector> ref_diversity_data; +}; + +using AdaptiveRKVE2EDiversityTest = ::testing::TestWithParam; + +std::vector E2E_DIVERSITY_TEST_CASES = { + // basic + { + {1, 4, 1}, + // clang-format off + { + 1.0, + 1.0, + 1.0, + 1.0 + }, + /* start_size = */ 2, + /* eviction_size = */ 2, + {{-1.0, -1.0}} + }, + // larger basic + { + {1, 6, 1}, + // clang-format off + { + 6.5, + -11.0, + 1.0, + 1.0, + 1.0, + 1.0, + }, + /* start_size = */ 2, + /* eviction_size = */ 4, + {{-1.0, -1.0, -2.0, -2.0}, + {-2.0, -2.0, -1.0, -1.0}} + }, + // two heads basic + { + {2, 8, 1}, + // clang-format off + { + 6.5, + -11.0, + 1.0, + 1.0, + 1.0, + 1.0, + 42.0, + -13.7, + + 1337.0, + -1256.9, + -1.0, + -1.0, + -1.0, + -1.0, + 0.2, + 0.0 + }, + /* start_size = */ 2, + /* eviction_size = */ 4, + {{-1.0, -1.0, -2.0, -2.0}, + {-2.0, -2.0, -1.0, -1.0}} + }, + // zeroed second head (where it matters) + { + {2, 8, 1}, + // clang-format off + { + 6.5, + -11.0, + 1.0, + 1.0, + 1.0, + 1.0, + 42.0, + -13.7, + + 1337.0, + -1256.9, + 0.0, + 0.0, + 0.0, + 0.0, + 0.2, + 0.0 + }, + /* start_size = */ 2, + /* eviction_size = */ 4, + {{-0.5, -0.5, -1.0, -1.0}, + {-1.0, -1.0, -0.5, -0.5}} + }, + // more embedding dimensions + { + {2, 
8, 4}, + // clang-format off + { + 6.5, 8.3, 5.1, -7.4, + -11.0, 1.9, 7.1, 4.8, + 8.0, 8.0, 8.0, 8.0, + 8.0, 8.0, 8.0, 8.0, + 8.0, 8.0, 8.0, 8.0, + 8.0, 8.0, 8.0, 8.0, + 42.0, -41.7, 8.3, 1.0, + -13.7, 0.0, 0.0, 15.1, + + 1337.0, -1.9, -1.4, 475.1, + -1256.9, 1.0, 789.0, 1421.3, + -2.0, -2.0, -2.0, -2.0, + -2.0, -2.0, -2.0, -2.0, + -2.0, -2.0, -2.0, -2.0, + -2.0, -2.0, -2.0, -2.0, + 0.2, -81.3, 74.3, -641.1, + 0.0, 14.7, 98.1, -27.7 + }, + /* start_size = */ 2, + /* eviction_size = */ 4, + {{-1.0, -1.0, -2.0, -2.0}, + {-2.0, -2.0, -1.0, -1.0}} + }, + // orthogonal tokens + { + {2, 8, 4}, + // clang-format off + { + 6.5, 8.3, 5.1, -7.4, + -11.0, 1.9, 7.1, 4.8, + 8.0, 0.0, 0.0, 0.0, + 0.0, 0.0, -18.0, 0.0, + 0.0, 0.0, 0.0, 0.1, + 0.0, 1288.0, 0.0, 0.0, + 42.0, -41.7, 8.3, 1.0, + -13.7, 0.0, 0.0, 15.1, + + 1337.0, -1.9, -1.4, 475.1, + -1256.9, 1.0, 789.0, 1421.3, + 0.0, 0.0, 2.0, 0.0, + 0.0, -12.0, 0.0, 0.0, + 12.8, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 65.5, + 0.2, -81.3, 74.3, -641.1, + 0.0, 14.7, 98.1, -27.7 + }, + /* start_size = */ 2, + /* eviction_size = */ 4, + {{0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0}} + }, + // random excel-checked golden + { + {2, 10, 4}, + // clang-format off + { + 4.949, -7.294, -6.330, 3.757, + -3.561, 1.029, 5.030, -9.483, + 5.350, -2.745, -1.404, -7.788, + -1.086, 4.576, -8.726, -8.815, + 3.144, 8.512, 8.518, -8.386, + 7.889, -5.721, 5.507, 4.295, + -6.624, -8.463, 7.474, 9.879, + 4.534, -5.908, -9.388, 2.356, + 7.497, 8.186, -8.658, -4.796, + -8.248, -9.797, -7.907, -4.513, + + 3.469, 7.633, 7.244, -6.844, + -7.173, 4.450, 6.705, -7.035, + 8.773, -7.571, -9.878, -9.584, + 0.807, 8.059, -7.172, 4.303, + -3.323, -8.852, 1.167, -1.126, + -4.428, 9.678, -6.547, 0.037, + -8.152, -9.865, 3.694, -7.650, + 0.359, 8.018, -7.152, -6.242, + -9.120, -7.228, -9.186, 3.202, + -9.304, -0.401, -5.287, 6.834 + }, + // clang-format on + + /* start_size = */ 2, + /* eviction_size = */ 6, + { + {-0.237145, -0.237145, -0.352696, -0.487902, -0.072365, 
-0.707192}, + {-0.334657, -0.505941, 0, 0.036135, -0.634881,-0.490221}, + {-0.380811, -0.398746801, -0.432080003, -0.693021748, 0, 0.067216441} + }, + } +}; + +TEST_P(AdaptiveRKVE2EDiversityTest, CalculatesDiversityCorrectly) { + auto test_struct = GetParam(); + ov::reference::AdaptiveRKVDiversityCalculator calculator(test_struct.start_size, + test_struct.eviction_size, + DEFAULT_BLOCK_SIZE); + + auto test_diversity = calculator.calculate_block_diversity(test_struct.k_data.data(), test_struct.k_shape); + ASSERT_EQ(test_diversity.size(), test_struct.ref_diversity_data.size()); + for (size_t i = 0; i < test_diversity.size(); i++) { + ASSERT_EQ(test_diversity[i].size(), test_struct.ref_diversity_data[i].size()); + } + + for (size_t i = 0; i < test_diversity.size(); i++) { + EXPECT_THAT(test_diversity[i], ::testing::Pointwise(::testing::DoubleNear(1e-6), test_struct.ref_diversity_data[i])); + } + +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, AdaptiveRKVE2EDiversityTest, ::testing::ValuesIn(E2E_DIVERSITY_TEST_CASES)); +} From c94e619a543ca993c517b2bc3fad89965fb9dd5f Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 8 Oct 2025 13:00:55 +0200 Subject: [PATCH 3/8] Add documentation --- .../reference/adaptive_rkv_diversity.hpp | 101 +++++++++--------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index b3f4f755266279..2fbc906473dbcd 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -19,26 +19,18 @@ namespace ov::reference { -/** @brief Reference implementation of the XAttention sparse attention prefill mechanism - * (https://arxiv.org/abs/2503.16428) */ +/** @brief Reference implementation of the Adaptive R-KV token diversity calculation mechanism + * 
(https://arxiv.org/pdf/2505.24133v3) */ template class AdaptiveRKVDiversityCalculator { public: - /** @param threshold Defines a threshold for introduced block sparsity - XAttention attempts to preserve the - * smallest subset of attention score matrix blocks so that the ratio of the attention score sum to the total sum of - * attention score matrix elements is no less than `threshold`. In other words, `threshold` defines a fraction of - * the attention score mass which is to be preserved by most "important" blocks. Valid range is 0.0-1.0, with 0.0 - * corresponding to 0% of the blocks retained, and 1.0 corresponding to 100% of the blocks retained. - * @param block_size The size of blocks into which the attention score matrix [num_heads, query_token_dimension, - * key_token_dimension] will be subdivided for purposes of determining the subset of the most important blocks - * according to `threshold`. This subdivision occurs on query and key dimensions of the attention score matrix with - * the same granularity, i.e. the resulting blocks have equal size on both dimensions. Essentially `block_size` - * defines the granularity of the eventual sparse attention computations. Must be a multiple of `stride`. - * @param stride The stride at which the full attention matrix is subsampled in a block-antidiagonal fashion to - * estimate the block importance. Note that the full attention matrix is not computed, instead the original query - * and key matrices are reshaped appropriately so that only the necessary elements are computed. Ideally, the - * computational complexity of the entire block estimation operation is `stride` times lower than the full attention - * matrix computation. + /** @param start_size Size, in tokens, of the key cache area that will be ignored for purposes of diversity + * calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of `block_size`. 
+ * @param eviction_size Size, in tokens, of the area immediately following the start area, the tokens in which will be + * considered for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction area, + * if any, are ignored. Must be a multiple of `block_size`. + * @param block_size Block size of the underlying paged attention implementation. The diversity values will be sum-reduced + * from per-token values to per-block values based on this number of tokens in a block. * */ AdaptiveRKVDiversityCalculator(size_t start_size, size_t eviction_size, size_t block_size) : m_start_size(start_size), @@ -48,17 +40,11 @@ class AdaptiveRKVDiversityCalculator { OPENVINO_ASSERT(eviction_size % block_size == 0); } - /** Divides the input rank-3 tensor into blocks along last two dimensions, performs the addition of the values - * inside each block and outputs each block sum into corresponding positions in the output tensor downsampled along - * the same dimensions. The output tensor dimensions are such that the query and key token dimensions are - * downsampled by `block_size` when compared to the *original* query and key tensors. - * @param attention_scores_data Pointer to the attention score input. - * @param attention_score_shape Shape of the attention score input tensor. Expected shape is [num_heads, - * num_query_tokens / stride, num_key_tokens / stride], where `num_query_tokens` and `num_key_tokens` must be - * multiples of `block_size`. - * @param out Pointer to the output tensor data (block sums) - * @param out_shape Shape of the output tensor data. Expected shape is [num_heads, num_query_tokens / block_size, - * num_key_tokens / block_size]. + /** Fills the diagonal of each square matrix slice (along axes 1 and 2, zero-based) of the input rank-3 tensor with + * a provided value. The operation is done in-place. + * @param in_out Pointer to the matrix data. + * @param in_out_shape Shape of the matrix data. 
Expected shape is [num_heads, token_dim, token_dim]. + * @param val Value to fill in the diagonal positions. */ void fill_diagonal_(T* in_out, const Shape& in_out_shape, @@ -77,6 +63,12 @@ class AdaptiveRKVDiversityCalculator { } } + /** For a rank-3 tensor, zeroes out the values that are less than the mean of the values of the corresponding slice at rank 2 (zero-based). Ranks 1 and 2 of the input tensor must be equal. Mean values are computed and provided externally. The operation is done in-place. + * @param in_out Pointer to the tensor data. + * @param in_out_shape Shape of the tensor data. Expected shape is [num_heads, token_dim, token_dim]. + * @param means Pointer to the tensor data containing the means of each slice of the `in_out` tensor along its rank 2 (zero-based). + * @param means_shape Shape of the means tensor. Expected shape is [num_heads, token_dim]. + */ void fill_low_values_with_zeros_(T* in_out, const Shape& in_out_shape, const T* means, @@ -102,17 +94,23 @@ class AdaptiveRKVDiversityCalculator { } } - void block_sum_diversity_values(const T* processed_similarity_token_data, - const Shape& processed_similarity_token_data_shape, + /** For a square matrix, sums each `block_size`-sized group of matrix rows to produce a row in the output matrix. + * @param in_data Pointer to the matrix data. + * @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a multiple of `block_size`. + * @param out Pointer to the output matrix data. + * @param out_shape Shape of the output matrix. Expected shape is [token_dim / block_size, token_dim]. 
+ */ + void block_sum_diversity_values(const T* in_data, + const Shape& in_shape, T* out, const Shape& out_shape) { - OPENVINO_ASSERT(processed_similarity_token_data_shape.size() == 2); // [token_dim, token_dim] - OPENVINO_ASSERT(processed_similarity_token_data_shape[0] == processed_similarity_token_data_shape[1]); - OPENVINO_ASSERT(processed_similarity_token_data_shape[0] % m_block_size == 0); + OPENVINO_ASSERT(in_shape.size() == 2); // [token_dim, token_dim] + OPENVINO_ASSERT(in_shape[0] == in_shape[1]); + OPENVINO_ASSERT(in_shape[0] % m_block_size == 0); OPENVINO_ASSERT(out_shape.size() == 2); // [block_dim, token_dim] - OPENVINO_ASSERT(out_shape[0] == processed_similarity_token_data_shape[0] / m_block_size); - OPENVINO_ASSERT(out_shape[1] == processed_similarity_token_data_shape[1]); + OPENVINO_ASSERT(out_shape[0] == in_shape[0] / m_block_size); + OPENVINO_ASSERT(out_shape[1] == in_shape[1]); std::memset(out, 0, out_shape[0] * out_shape[1] * sizeof(T)); @@ -121,28 +119,29 @@ class AdaptiveRKVDiversityCalculator { for (size_t out_token_dim_idx = 0; out_token_dim_idx < out_shape[1]; out_token_dim_idx++) { size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1]; for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) { - size_t source_offset = in_block_offset + in_token_in_block_idx * processed_similarity_token_data_shape[1] + out_token_dim_idx; - out[out_block_offset + out_token_dim_idx] -= processed_similarity_token_data[source_offset]; + size_t source_offset = in_block_offset + in_token_in_block_idx * in_shape[1] + out_token_dim_idx; + out[out_block_offset + out_token_dim_idx] -= in_data[source_offset]; } } } } - /** Applies XAttention to the provided query and key matrices, returning the subset of the most important blocks for - * each attention head, according to the configured block size and threshold, which are to be preserved in the - * subsequent sparse attention computation. 
- * @param query_data Pointer to the query input tensor data - * @param query_shape Shape of the query input tensor data. Expected shape is [num_heads, num_query_tokens, - * head_size], where `num_query_tokens` must be a multiple of both `block_size` and `stride`, padded with zeroes if - * necessary to do so in the real-world scenario. - * @param key_data Pointer to the key input tensor data + /** Calculates token diversity in the eviction area, partially aggregating the results per-block. The resulting + * diversity values have the shape of [num_eviction_blocks (== eviction_size / block_size), eviction_size]. Note + * that the 1-st rank is left unaggregated when compared to the full diversity calculation algorithm. The reason + * for this is as follows. The final per-block diversity value computation relies on knowing the subset of blocks + * in the eviction area that will be retained regardless of calculated diversity. This subset must be filtered out + * from the rank-1 dimension when performing reduce-mean in the original algorithm to get 1 diversity value per block + * in the eviction area. Due to implementation specifics the paged attention kernel does not know ahead of time which + * blocks will be "retained" - this information is only available on the openvino.genai level after the PA kernel has executed. + * Therefore the PA kernel will provide raw per-token values on the rank 1 of the returned diversity value matrix and delegatei + * the final reduce-mean and filtering to the openvino.genai level. + * @param key_data Pointer to the key cache tensor data * @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size], - * where `num_key_tokens` must be a multiple of both `block_size` and `stride`, padded with zeroes if necessary to - * do so in the real-world scenario. 
- * @return A vector of size `num_heads` of sets, each set containing pairs of block indices (.first is the block - * index along the query dimension, .second - along the key). Each set is the head-specific subset of blocks that - * must be preserved in the sparse attention computation. Indices are given in units of XAttention-specific - * `block_size` (as configured), which may differ from the block size in the paged attention implementation. + * where `num_key_tokens` must be no less than `start_size + eviction_size`. + * @return A rank-2 matrix in the std::vector representation with dimensions [eviction_size / block_size, eviction_size] containing + * the diversity values. The values are expected to be further mean-reduced along rank 1 (zero-based) at the point in time when the + * subset of blocks to be exclusively retained is known. */ std::vector> calculate_block_diversity(const T* key_data, const Shape& key_shape) { From ecd02cbcbc180a1db162527abf5ba7fb41d1ecd4 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 8 Oct 2025 13:01:45 +0200 Subject: [PATCH 4/8] Format --- .../reference/adaptive_rkv_diversity.hpp | 109 ++++++++++-------- .../reference/adaptive_rkv_diversity.cpp | 51 ++++---- 2 files changed, 86 insertions(+), 74 deletions(-) diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index 2fbc906473dbcd..0da9a9bbfb6cc7 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -18,19 +18,19 @@ namespace ov::reference { - /** @brief Reference implementation of the Adaptive R-KV token diversity calculation mechanism * (https://arxiv.org/pdf/2505.24133v3) */ template class AdaptiveRKVDiversityCalculator { public: /** @param start_size Size, in tokens, of the key cache area that will be ignored for purposes of diversity - * 
calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of `block_size`. + * calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of + * `block_size`. * @param eviction_size Size, in tokens, from the beginning of the start area, the tokens in which will be - * considred for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction area, - * if any, are ignored. Must be a multiple of `block_size`. - * @param block_size Block size of the underlying paged attention implementation. The diversity values will be sum-reduced - * from per-token values to per-block values based on this number of tokens in a block. + * considred for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction + * area, if any, are ignored. Must be a multiple of `block_size`. + * @param block_size Block size of the underlying paged attention implementation. The diversity values will be + * sum-reduced from per-token values to per-block values based on this number of tokens in a block. * */ AdaptiveRKVDiversityCalculator(size_t start_size, size_t eviction_size, size_t block_size) : m_start_size(start_size), @@ -46,13 +46,10 @@ class AdaptiveRKVDiversityCalculator { * @param in_out_shape Shape of the matrix data. Expected shape is [num_heads, token_dim, token_dim]. * @param val Value to fill in the diagonal positions. 
*/ - void fill_diagonal_(T* in_out, - const Shape& in_out_shape, - T val) { - OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] + void fill_diagonal_(T* in_out, const Shape& in_out_shape, T val) { + OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]); // [num_heads, token_dim, token_dim] - for (size_t head_idx = 0; head_idx < in_out_shape[0]; head_idx++) { size_t in_head_offset = head_idx * in_out_shape[1] * in_out_shape[2]; for (size_t token_dim_idx = 0; token_dim_idx < in_out_shape[1]; token_dim_idx++) { @@ -63,19 +60,19 @@ class AdaptiveRKVDiversityCalculator { } } - /** For a rank-3 tensor, zeroes out the values that are less than the mean of the values of the corresponding slice at rank 2 (zero-based). Ranks 1 and 2 of the input tensor must be equal. Mean values are computed and provided externally. The operation is done in-place. + /** For a rank-3 tensor, zeroes out the values that are less than the mean of the values of the corresponding slice + * at rank 2 (zero-based). Ranks 1 and 2 of the input tensor must be equal. Mean values are computed and provided + * externally. The operation is done in-place. * @param in_out Pointer to the tensor data. * @param in_out_shape Shape of the tensor data. Expected shape is [num_heads, token_dim, token_dim]. - * @param means Pointer to the tensor data containing the means of each slice of the `in_out` tensor along its rank 2 (zero-based). + * @param means Pointer to the tensor data containing the means of each slice of the `in_out` tensor along its rank + * 2 (zero-based). * @param means_shape Shape of the means tensor. Expected shape is [num_heads, token_dim]. 
*/ - void fill_low_values_with_zeros_(T* in_out, - const Shape& in_out_shape, - const T* means, - const Shape& means_shape) { + void fill_low_values_with_zeros_(T* in_out, const Shape& in_out_shape, const T* means, const Shape& means_shape) { OPENVINO_ASSERT(in_out_shape.size() == 3); // [num_heads, token_dim, token_dim] OPENVINO_ASSERT(in_out_shape[1] == in_out_shape[2]); - OPENVINO_ASSERT(means_shape.size() == 2); // [num_heads, token_dim] + OPENVINO_ASSERT(means_shape.size() == 2); // [num_heads, token_dim] OPENVINO_ASSERT(means_shape[0] == in_out_shape[0]); OPENVINO_ASSERT(means_shape[1] == in_out_shape[1]); @@ -96,14 +93,12 @@ class AdaptiveRKVDiversityCalculator { /** For a square matrix, sums each `block_size`-sized group of matrix rows to produce a row in the output matrix. * @param in_data Pointer to the matrix data. - * @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a multiple of `block_size`. + * @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a + * multiple of `block_size`. * @param out Pointer to the output matrix data. * @param out_shape Shape of the output matrix. Expected shape is [token_dim / block_size, token_dim]. 
*/ - void block_sum_diversity_values(const T* in_data, - const Shape& in_shape, - T* out, - const Shape& out_shape) { + void block_sum_diversity_values(const T* in_data, const Shape& in_shape, T* out, const Shape& out_shape) { OPENVINO_ASSERT(in_shape.size() == 2); // [token_dim, token_dim] OPENVINO_ASSERT(in_shape[0] == in_shape[1]); OPENVINO_ASSERT(in_shape[0] % m_block_size == 0); @@ -117,11 +112,11 @@ class AdaptiveRKVDiversityCalculator { for (size_t out_block_dim_idx = 0; out_block_dim_idx < out_shape[0]; out_block_dim_idx++) { size_t out_block_offset = out_block_dim_idx * out_shape[1]; for (size_t out_token_dim_idx = 0; out_token_dim_idx < out_shape[1]; out_token_dim_idx++) { - size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1]; - for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) { - size_t source_offset = in_block_offset + in_token_in_block_idx * in_shape[1] + out_token_dim_idx; - out[out_block_offset + out_token_dim_idx] -= in_data[source_offset]; - } + size_t in_block_offset = (out_block_dim_idx * m_block_size) * out_shape[1]; + for (size_t in_token_in_block_idx = 0; in_token_in_block_idx < m_block_size; in_token_in_block_idx++) { + size_t source_offset = in_block_offset + in_token_in_block_idx * in_shape[1] + out_token_dim_idx; + out[out_block_offset + out_token_dim_idx] -= in_data[source_offset]; + } } } } @@ -131,37 +126,54 @@ class AdaptiveRKVDiversityCalculator { * that the 1-st rank is left unaggregated when compared to the full diversity calculation algorithm. The reason * for this is as follows. The final per-block diversity value computation relies on knowing the subset of blocks * in the eviction area that will be retained regardless of calculated diversity. This subset must be filtered out - * from the rank-1 dimension when performing reduce-mean in the original algorithm to get 1 diversity value per block - * in the eviction area. 
Due to implementation specifics the paged attention kernel does not know ahead of time which - * blocks will be "retained" - this information is only available on the openvino.genai level after the PA kernel has executed. - * Therefore the PA kernel will provide raw per-token values on the rank 1 of the returned diversity value matrix and delegatei - * the final reduce-mean and filtering to the openvino.genai level. + * from the rank-1 dimension when performing reduce-mean in the original algorithm to get 1 diversity value per + * block in the eviction area. Due to implementation specifics the paged attention kernel does not know ahead of + * time which blocks will be "retained" - this information is only available on the openvino.genai level after the + * PA kernel has executed. Therefore the PA kernel will provide raw per-token values on the rank 1 of the returned + * diversity value matrix and delegatei the final reduce-mean and filtering to the openvino.genai level. * @param key_data Pointer to the key cache tensor data * @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size], * where `num_key_tokens` must be no less than `start_size + eviction_size`. - * @return A rank-2 matrix in the std::vector representation with dimensions [eviction_size / block_size, eviction_size] containing - * the diversity values. The values are expected to be further mean-reduced along rank 1 (zero-based) at the point in time when the - * subset of blocks to be exclusively retained is known. + * @return A rank-2 matrix in the std::vector representation with dimensions [eviction_size / block_size, + * eviction_size] containing the diversity values. The values are expected to be further mean-reduced along rank 1 + * (zero-based) at the point in time when the subset of blocks to be exclusively retained is known. 
*/ - std::vector> calculate_block_diversity(const T* key_data, - const Shape& key_shape) { - OPENVINO_ASSERT(key_shape.size() == 3); // [num_heads, key_token_len, head_dim] + std::vector> calculate_block_diversity(const T* key_data, const Shape& key_shape) { + OPENVINO_ASSERT(key_shape.size() == 3); // [num_heads, key_token_len, head_dim] OPENVINO_ASSERT(key_shape[1] >= m_start_size + m_eviction_size); - auto normalized_key_data_buf = allocate_buf(key_shape); // Should be safe to use this in-place - ov::reference::normalize_l2(key_data, normalized_key_data_buf.get(), key_shape, {2}, std::numeric_limits::epsilon(), ov::op::EpsMode::ADD); + ov::reference::normalize_l2(key_data, + normalized_key_data_buf.get(), + key_shape, + {2}, + std::numeric_limits::epsilon(), + ov::op::EpsMode::ADD); Shape cos_similar_shape = {key_shape[0], key_shape[1], key_shape[1]}; auto cos_similar_buf = allocate_buf(cos_similar_shape); - ov::reference::matmul(normalized_key_data_buf.get(), normalized_key_data_buf.get(), cos_similar_buf.get(), key_shape, key_shape, cos_similar_shape, /* transpose_arg0 = */ false, /* transpose_arg1 = */ true); + ov::reference::matmul(normalized_key_data_buf.get(), + normalized_key_data_buf.get(), + cos_similar_buf.get(), + key_shape, + key_shape, + cos_similar_shape, + /* transpose_arg0 = */ false, + /* transpose_arg1 = */ true); normalized_key_data_buf.reset(); Shape evictable_subset_shape = {key_shape[0], m_eviction_size, m_eviction_size}; auto evictable_subset_buf = allocate_buf(evictable_subset_shape); // stops? 
- ov::reference::slice(reinterpret_cast(cos_similar_buf.get()), cos_similar_shape, reinterpret_cast(evictable_subset_buf.get()), evictable_subset_shape, sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, /* axes = */{1, 2}); + ov::reference::slice(reinterpret_cast(cos_similar_buf.get()), + cos_similar_shape, + reinterpret_cast(evictable_subset_buf.get()), + evictable_subset_shape, + sizeof(T), + /* starts = */ {m_start_size, m_start_size}, + /* steps = */ {1, 1}, + /* axes = */ {1, 2}); cos_similar_buf.reset(); fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0); @@ -175,12 +187,18 @@ class AdaptiveRKVDiversityCalculator { Shape aggregated_token_similarities_shape = {m_eviction_size, m_eviction_size}; auto aggregated_token_similarities_buf = allocate_buf(aggregated_token_similarities_shape); - ov::reference::reduce_mean(evictable_subset_buf.get(), aggregated_token_similarities_buf.get(), evictable_subset_shape, {0}); + ov::reference::reduce_mean(evictable_subset_buf.get(), + aggregated_token_similarities_buf.get(), + evictable_subset_shape, + {0}); evictable_subset_buf.reset(); Shape block_diversity_shape = {m_eviction_size / m_block_size, m_eviction_size}; auto block_diversity_buf = allocate_buf(block_diversity_shape); - block_sum_diversity_values(aggregated_token_similarities_buf.get(), aggregated_token_similarities_shape, block_diversity_buf.get(), block_diversity_shape); + block_sum_diversity_values(aggregated_token_similarities_buf.get(), + aggregated_token_similarities_shape, + block_diversity_buf.get(), + block_diversity_shape); std::vector> retval(block_diversity_shape[0], std::vector(block_diversity_shape[1])); for (size_t block_idx = 0; block_idx < block_diversity_shape[0]; block_idx++) { for (size_t token_idx = 0; token_idx < block_diversity_shape[1]; token_idx++) { @@ -199,7 +217,6 @@ class AdaptiveRKVDiversityCalculator { return std::shared_ptr(new T[ov::shape_size(shape)]); } - size_t m_start_size; size_t 
m_eviction_size; size_t m_block_size; diff --git a/src/core/tests/reference/adaptive_rkv_diversity.cpp b/src/core/tests/reference/adaptive_rkv_diversity.cpp index 6595162e1aa3ec..d3f71cdd3758c9 100644 --- a/src/core/tests/reference/adaptive_rkv_diversity.cpp +++ b/src/core/tests/reference/adaptive_rkv_diversity.cpp @@ -12,7 +12,6 @@ size_t DEFAULT_BLOCK_SIZE = 2; size_t DEFAULT_START_SIZE = 2; size_t DEFAULT_EVICTION_SIZE = 10; - TEST(AdaptiveRKVE2ESmokeTest, CalculatesDiversityWithoutThrowing) { ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, DEFAULT_EVICTION_SIZE, @@ -24,7 +23,6 @@ TEST(AdaptiveRKVE2ESmokeTest, CalculatesDiversityWithoutThrowing) { EXPECT_NO_THROW(calculator.calculate_block_diversity(mock_data.data(), mock_shape)); }; - struct FillDiagonalTestData { ov::Shape in_shape; std::vector in_data; @@ -33,10 +31,9 @@ struct FillDiagonalTestData { using AdaptiveRKVDiversityFillDiagonalTest = ::testing::TestWithParam; -std::vector FILL_DIAGONAL_TEST_CASES = { - { - {2, 4, 4}, - // clang-format off +std::vector FILL_DIAGONAL_TEST_CASES = {{ + {2, 4, 4}, + // clang-format off { 3.144, 8.512, 8.518, -8.386, 7.889, -5.721, 5.507, 4.295, @@ -48,9 +45,9 @@ std::vector FILL_DIAGONAL_TEST_CASES = { 3.469, 7.633, 7.244, -6.844, -7.173, 4.450, 6.705, -7.035 }, - // clang-format on + // clang-format on - // clang-format off + // clang-format off { 42.00, 8.512, 8.518, -8.386, 7.889, 42.00, 5.507, 4.295, @@ -62,21 +59,20 @@ std::vector FILL_DIAGONAL_TEST_CASES = { 3.469, 7.633, 42.00, -6.844, -7.173, 4.450, 6.705, 42.00 }, - // clang-format on - } -}; + // clang-format on +}}; TEST_P(AdaptiveRKVDiversityFillDiagonalTest, FillsDiagonal) { auto test_struct = GetParam(); ASSERT_EQ(test_struct.in_data.size(), ov::shape_size(test_struct.in_shape)); ASSERT_EQ(test_struct.ref_out_data.size(), ov::shape_size(test_struct.in_shape)); - ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, DEFAULT_EVICTION_SIZE, 
DEFAULT_BLOCK_SIZE); + ov::reference::AdaptiveRKVDiversityCalculator calculator(DEFAULT_START_SIZE, + DEFAULT_EVICTION_SIZE, + DEFAULT_BLOCK_SIZE); std::vector test_out_data = test_struct.in_data; - calculator.fill_diagonal_(test_out_data.data(), - test_struct.in_shape, - 42.0); + calculator.fill_diagonal_(test_out_data.data(), test_struct.in_shape, 42.0); EXPECT_EQ(test_out_data, test_struct.ref_out_data); } @@ -152,7 +148,10 @@ TEST_P(AdaptiveRKVFillLowValuesWithZerosTest, FillsLowValuesWithZero) { DEFAULT_EVICTION_SIZE, DEFAULT_BLOCK_SIZE); std::vector test_out_data = test_struct.in_data; - calculator.fill_low_values_with_zeros_(test_out_data.data(), test_struct.in_shape, test_struct.means.data(), test_struct.means_shape); + calculator.fill_low_values_with_zeros_(test_out_data.data(), + test_struct.in_shape, + test_struct.means.data(), + test_struct.means_shape); EXPECT_THAT(test_out_data, ::testing::Pointwise(::testing::DoubleNear(1e-8), test_struct.ref_out_data)); } @@ -161,7 +160,6 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs, AdaptiveRKVFillLowValuesWithZerosTest, ::testing::ValuesIn(FILL_LOW_VALUES_WITH_ZEROS_TEST_CASES)); - struct BlockSumTestData { ov::Shape in_shape; std::vector in_data; @@ -409,17 +407,14 @@ std::vector E2E_DIVERSITY_TEST_CASES = { -9.120, -7.228, -9.186, 3.202, -9.304, -0.401, -5.287, 6.834 }, - // clang-format on + // clang-format on /* start_size = */ 2, /* eviction_size = */ 6, - { - {-0.237145, -0.237145, -0.352696, -0.487902, -0.072365, -0.707192}, - {-0.334657, -0.505941, 0, 0.036135, -0.634881,-0.490221}, - {-0.380811, -0.398746801, -0.432080003, -0.693021748, 0, 0.067216441} - }, - } -}; + {{-0.237145, -0.237145, -0.352696, -0.487902, -0.072365, -0.707192}, + {-0.334657, -0.505941, 0, 0.036135, -0.634881, -0.490221}, + {-0.380811, -0.398746801, -0.432080003, -0.693021748, 0, 0.067216441}}, + }}; TEST_P(AdaptiveRKVE2EDiversityTest, CalculatesDiversityCorrectly) { auto test_struct = GetParam(); @@ -434,10 +429,10 @@ 
TEST_P(AdaptiveRKVE2EDiversityTest, CalculatesDiversityCorrectly) { } for (size_t i = 0; i < test_diversity.size(); i++) { - EXPECT_THAT(test_diversity[i], ::testing::Pointwise(::testing::DoubleNear(1e-6), test_struct.ref_diversity_data[i])); + EXPECT_THAT(test_diversity[i], + ::testing::Pointwise(::testing::DoubleNear(1e-6), test_struct.ref_diversity_data[i])); } - }; INSTANTIATE_TEST_SUITE_P(VariousInputs, AdaptiveRKVE2EDiversityTest, ::testing::ValuesIn(E2E_DIVERSITY_TEST_CASES)); -} +} // namespace adaptive_rkv_test From fde6e790feb7ab8eead4a524fa86887b6af2a46a Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 9 Oct 2025 10:55:20 +0200 Subject: [PATCH 5/8] Fix comments --- .../openvino/reference/adaptive_rkv_diversity.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index 0da9a9bbfb6cc7..5e10e251f95b59 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -4,17 +4,11 @@ #pragma once -#include -#include -#include -#include - #include "openvino/op/util/attr_types.hpp" #include "openvino/reference/matmul.hpp" #include "openvino/reference/normalize_l2.hpp" #include "openvino/reference/reduce_mean.hpp" #include "openvino/reference/slice.hpp" -#include "openvino/runtime/tensor.hpp" namespace ov::reference { @@ -27,7 +21,7 @@ class AdaptiveRKVDiversityCalculator { * calculation, starting from the beginning of the token dimension ("start area"). Must be a multiple of * `block_size`. * @param eviction_size Size, in tokens, from the beginning of the start area, the tokens in which will be - * considred for purposes of diversity calculation ("eviction area"). 
The rest of the tokens after the eviction + * considered for purposes of diversity calculation ("eviction area"). The rest of the tokens after the eviction * area, if any, are ignored. Must be a multiple of `block_size`. * @param block_size Block size of the underlying paged attention implementation. The diversity values will be * sum-reduced from per-token values to per-block values based on this number of tokens in a block. @@ -92,6 +86,8 @@ class AdaptiveRKVDiversityCalculator { } /** For a square matrix, sums each `block_size`-sized group of matrix rows to produce a row in the output matrix. + * In the overall algorithm context, each summed value represents diversity (the negative of inter-token cosine + * similarity), where larger absolute values indicate greater diversity. * @param in_data Pointer to the matrix data. * @param in_shape Shape of the matrix data. Expected shape is [token_dim, token_dim], where token_dim must be a * multiple of `block_size`. @@ -130,7 +126,7 @@ class AdaptiveRKVDiversityCalculator { * block in the eviction area. Due to implementation specifics the paged attention kernel does not know ahead of * time which blocks will be "retained" - this information is only available on the openvino.genai level after the * PA kernel has executed. Therefore the PA kernel will provide raw per-token values on the rank 1 of the returned - * diversity value matrix and delegatei the final reduce-mean and filtering to the openvino.genai level. + * diversity value matrix and delegate the final reduce-mean and filtering to the openvino.genai level. * @param key_data Pointer to the key cache tensor data * @param key_shape Shape of the key input tensor data. Expected shape is [num_heads, num_key_tokens, head_size], * where `num_key_tokens` must be no less than `start_size + eviction_size`. 
@@ -165,7 +161,6 @@ class AdaptiveRKVDiversityCalculator { Shape evictable_subset_shape = {key_shape[0], m_eviction_size, m_eviction_size}; auto evictable_subset_buf = allocate_buf(evictable_subset_shape); - // stops? ov::reference::slice(reinterpret_cast(cos_similar_buf.get()), cos_similar_shape, reinterpret_cast(evictable_subset_buf.get()), @@ -173,7 +168,7 @@ class AdaptiveRKVDiversityCalculator { sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, - /* axes = */ {1, 2}); + /* axes = */ {1, 2}); // stops are defined by output shape cos_similar_buf.reset(); fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0); From 809acff183d4ef6f9c1fd99a4cef0da6120c9434 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 9 Oct 2025 13:38:12 +0200 Subject: [PATCH 6/8] Fix format --- .../include/openvino/reference/adaptive_rkv_diversity.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index 5e10e251f95b59..9ddd371c5df14a 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -168,7 +168,7 @@ class AdaptiveRKVDiversityCalculator { sizeof(T), /* starts = */ {m_start_size, m_start_size}, /* steps = */ {1, 1}, - /* axes = */ {1, 2}); // stops are defined by output shape + /* axes = */ {1, 2}); // stops are defined by output shape cos_similar_buf.reset(); fill_diagonal_(evictable_subset_buf.get(), evictable_subset_shape, 0.0); From 44000b48dd49c3e53893ff5e92d2220e245ff897 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 10 Oct 2025 10:59:03 +0200 Subject: [PATCH 7/8] Fix warnings --- .../include/openvino/reference/adaptive_rkv_diversity.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp index 9ddd371c5df14a..2a0c4b7bca56a2 100644 --- a/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp +++ b/src/core/reference/include/openvino/reference/adaptive_rkv_diversity.hpp @@ -166,7 +166,7 @@ class AdaptiveRKVDiversityCalculator { reinterpret_cast(evictable_subset_buf.get()), evictable_subset_shape, sizeof(T), - /* starts = */ {m_start_size, m_start_size}, + /* starts = */ {static_cast(m_start_size), static_cast(m_start_size)}, /* steps = */ {1, 1}, /* axes = */ {1, 2}); // stops are defined by output shape cos_similar_buf.reset(); From 82d425c5d91a2bf26d322c039ad2d2a2cfc0948b Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Sat, 11 Oct 2025 11:16:53 +0200 Subject: [PATCH 8/8] mt