From 66248d207c730525b33b39f2577cb2462804bc56 Mon Sep 17 00:00:00 2001
From: yael-works
Date: Tue, 28 Oct 2025 11:25:39 +0200
Subject: [PATCH 01/15] Add skeleton for GGML_OP_SPARSEK_ATTN (SparseK
 Attention): new operator definition and tensor creation in ggml.c/h;
 backend implementation pending

Co-authored-by: Yael Shuker
Co-authored-by: Gitty Burstein
---
 ggml/include/ggml.h | 12 +++++++++++-
 ggml/src/ggml.c     | 46 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d948b00cc7f30..c47c5404c9c3b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -529,7 +529,7 @@ extern "C" {
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
-
+        GGML_OP_SPARSEK_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -2231,6 +2231,16 @@ extern "C" {
     // n_head % ne32 == 0
     // ne3 % ne33 == 0
     //
+
+    GGML_API struct ggml_tensor * ggml_sparsek_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * Q,
+            struct ggml_tensor  * K,
+            struct ggml_tensor  * V,
+            int32_t               k_top,
+            int32_t               win_local,
+            int32_t               stride_global);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be8456..6aec78051a3c8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1019,7 +1019,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };

-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1094,7 +1094,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
-
+    "sparsek_attn(Q, K, V, k_top, win_local, stride_global)",
     "flash_attn_ext(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -1123,7 +1123,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -5063,6 +5063,46 @@ struct ggml_tensor * ggml_top_k(
     return result;
 }

+// ggml_sparsek_attn
+struct ggml_tensor * ggml_sparsek_attn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * Q,
+        struct ggml_tensor  * K,
+        struct ggml_tensor  * V,
+        int32_t               k_top,
+        int32_t               win_local,
+        int32_t               stride_global) {
+
+    // suppress unused-parameter warnings (the parameters are not used yet)
+    GGML_UNUSED(k_top);
+    GGML_UNUSED(win_local);
+    GGML_UNUSED(stride_global);
+
+    // basic validity checks
+    GGML_ASSERT(Q != NULL);
+    GGML_ASSERT(K != NULL);
+    GGML_ASSERT(V != NULL);
+    GGML_ASSERT(ggml_can_mul_mat(K, Q));
+
+    // create the output tensor with the appropriate dimensions
+    int64_t ne[GGML_MAX_DIMS] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne);
+
+    // set the operator type and its sources
+    result->op     = GGML_OP_SPARSEK_ATTN;
+    result->src[0] = Q;
+    result->src[1] = K;
+    result->src[2] = V;
+
+    // store the numeric parameters in op_params (the convention used in ggml)
+    result->op_params[0] = k_top;
+    result->op_params[1] = win_local;
+    result->op_params[2] = stride_global;
+
+    return result;
+}
+
+
 // ggml_flash_attn_ext

 struct ggml_tensor * ggml_flash_attn_ext(

From 5d6d3b771a47bee4a316e5a197cb99b6af4131a0 Mon Sep 17 00:00:00 2001
From: yael-works
Date: Tue, 28 Oct 2025 14:06:10 +0200
Subject: [PATCH 02/15] Add CPU
support for SparseK Attention (without performance checks) Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ ggml/src/ggml-cpu/ops.cpp | 82 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 2 + ggml/src/ggml.c | 46 +++++++++++--------- tests/test-backend-ops.cpp | 61 ++++++++++++++++++++++++++- 5 files changed, 174 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 9ec485cfa2ff7..b43a2b437d8dc 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1952,6 +1952,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_attn_ext(params, tensor); } break; + case GGML_OP_SPARSEK_ATTN: + { + ggml_compute_forward_sparsek_attn(params, tensor); + break; + } case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3156bd60101d7..5bc0cb3e298c7 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7907,6 +7907,88 @@ void ggml_compute_forward_argsort( } } +//------------------------------------------------------------------------------ +// SparseK Attention (CPU) +//------------------------------------------------------------------------------ + +static void ggml_compute_forward_sparsek_attn_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + if (params->ith != 0) return; // main thread only + + const struct ggml_tensor * Q = dst->src[0]; + const struct ggml_tensor * K = dst->src[1]; + const struct ggml_tensor * V = dst->src[2]; + + GGML_ASSERT(Q && K && V); + GGML_ASSERT(Q->type == GGML_TYPE_F32); + GGML_ASSERT(K->type == GGML_TYPE_F32); + GGML_ASSERT(V->type == GGML_TYPE_F32); + + const int32_t k_top = ggml_get_op_params_i32(dst, 0); + const int32_t win_local = ggml_get_op_params_i32(dst, 1); + const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); + + const int64_t D = Q->ne[0]; // embedding dim + const int64_t T = Q->ne[1]; // sequence length + + const float * q = (const float *) Q->data; + const float * k = (const float *) K->data; + const float * v = (const float *) V->data; + float * out = (float *) dst->data; + + + for (int64_t i = 0; i < T; ++i) { + for (int64_t j = 0; j < T; ++j) { + float dot = 0.0f; + for (int64_t d = 0; d < D; ++d) + dot += q[i*D + d] * k[j*D + d]; + out[i*T + j] = dot / sqrtf((float) D); + } + } + + for (int64_t i = 0; i < T; ++i) { + float * row = &out[i*T]; + for (int64_t j = 0; j < T; ++j) + if (row[j] < row[k_top]) row[j] = -INFINITY; + } + + for (int64_t i = 0; i < T; ++i) { + float maxv = -INFINITY; + for (int64_t j = 0; j < T; ++j) + if (out[i*T + j] > maxv) maxv = out[i*T + j]; + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) { + out[i*T + j] = expf(out[i*T + j] - maxv); + sum += out[i*T + j]; + } + for (int64_t j = 0; j < T; ++j) + out[i*T + j] /= sum; + } + + + float * result = (float *) dst->data; + for (int64_t i = 0; i < T; ++i) { + for (int64_t d = 0; d < D; ++d) { + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) + sum += out[i*T + j] * v[j*D + d]; + result[i*D + d] = sum; + } + } + + GGML_PRINT_DEBUG("[SPARSEK CPU] k_top=%d win_local=%d stride=%d\n", + k_top, win_local, stride_glb); +} + +void ggml_compute_forward_sparsek_attn( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + ggml_compute_forward_sparsek_attn_f32(params, dst); +} + + // 
ggml_compute_forward_flash_attn_ext static void ggml_compute_forward_flash_attn_ext_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 9824a03b45833..e43b23a5587bd 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -86,6 +86,8 @@ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sparsek_attn(const struct ggml_compute_params * params, struct ggml_tensor * dst); + void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, const bool masked, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6aec78051a3c8..9ad055c994672 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -990,7 +990,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "TIMESTEP_EMBEDDING", "ARGSORT", "LEAKY_RELU", - + "SPARSEK_ATTN", "FLASH_ATTN_EXT", "FLASH_ATTN_BACK", "SSM_CONV", @@ -1094,7 +1094,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", "leaky_relu(x)", - "sparsek_attn(Q, K, V, k_top, win_local, stride_global)", + "sparsek_attn(x)", "flash_attn_ext(x)", "flash_attn_back(x)", "ssm_conv(x)", @@ -5073,36 +5073,42 @@ struct ggml_tensor * ggml_sparsek_attn( int32_t win_local, int32_t stride_global) { - // ביטול אזהרות (אם טרם משתמשים בפרמטרים) - GGML_UNUSED(k_top); - GGML_UNUSED(win_local); - GGML_UNUSED(stride_global); - - // בדיקות תקינות בסיסיות - GGML_ASSERT(Q != NULL); - GGML_ASSERT(K != NULL); - GGML_ASSERT(V != NULL); GGML_ASSERT(ggml_can_mul_mat(K, Q)); + GGML_ASSERT(Q->ne[3] == K->ne[3] && Q->ne[3] == V->ne[3]); + + int64_t ne[4] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + - // יצירת טנזור פלט בממדים המתאימים - int64_t ne[GGML_MAX_DIMS] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne); + int32_t params_i32[3] = { k_top, win_local, stride_global }; + ggml_set_op_params(result, params_i32, sizeof(params_i32)); - // הגדרת סוג האופרטור והמקורות result->op = GGML_OP_SPARSEK_ATTN; result->src[0] = Q; result->src[1] = K; result->src[2] = V; - // שמירת הפרמטרים המספריים במערך op_params (שיטה הנהוגה ב־ggml) - result->op_params[0] = k_top; - result->op_params[1] = win_local; - result->op_params[2] = stride_global; - return result; } +void ggml_sparsek_attn_set_params(struct ggml_tensor * a, + int32_t k_top, + int32_t win_local, + int32_t stride_global) { + GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN); + ggml_set_op_params_i32(a, 0, k_top); + ggml_set_op_params_i32(a, 1, win_local); + ggml_set_op_params_i32(a, 2, stride_global); +} + +int32_t ggml_sparsek_attn_get_param(const struct ggml_tensor * a, int index) { + GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN); + return ggml_get_op_params_i32(a, index); +} + + + // ggml_flash_attn_ext struct ggml_tensor * ggml_flash_attn_ext( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index aee1730137900..e899bb8c50168 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1778,6 +1778,7 @@ struct test_example : public test_case { }; + // GGML_OP_UNARY struct 
test_unary : public test_case { const ggml_unary_op op; @@ -5362,7 +5363,46 @@ struct test_leaky_relu : public test_case { } }; -// GGML_OP_FLASH_ATTN_EXT +// GGML_OP_SPARSEK_ATTN +struct test_sparsek_attn : public test_case { + const int64_t d_qk; + const int64_t d_v; + const int64_t n_head; + const int64_t n_tokens; + const int64_t batch; + const int32_t k_top; + const int32_t win_local; + const int32_t stride_global; + + std::string vars() override { + return VARS_TO_STR9(d_qk, d_v, n_head, n_tokens, batch, k_top, win_local, stride_global, 0); + } + + test_sparsek_attn(int64_t d_qk = 128, int64_t d_v = 128, int64_t n_head = 8, + int64_t n_tokens = 256, int64_t batch = 4, + int32_t k_top = 32, int32_t win_local = 64, int32_t stride_global = 128) + : d_qk(d_qk), d_v(d_v), n_head(n_head), n_tokens(n_tokens), batch(batch), + k_top(k_top), win_local(win_local), stride_global(stride_global) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_q = n_tokens; + ggml_tensor * Q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_qk, n_q, n_head, batch); + ggml_set_name(Q, "Q"); + ggml_tensor * K = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_qk, n_tokens, n_head, batch); + ggml_set_name(K, "K"); + ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_v, n_tokens, n_head, batch); + ggml_set_name(V, "V"); + + ggml_tensor * out = ggml_sparsek_attn(ctx, Q, K, V, k_top, win_local, stride_global); + ggml_set_name(out, "SPARSEK_ATTN_out"); + + return out; + } +}; + + + +// GGML_OP_FLAsH_ATTN_EXT struct test_flash_attn_ext : public test_case { const int64_t hsk; // K head size const int64_t hsv; // V head size @@ -7095,7 +7135,7 @@ static std::vector> make_test_cases_eval() { if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA - + for (bool mask : { true, false } ) { for (bool sinks : { true, false } ) { for (float max_bias : { 0.0f, 8.0f }) { @@ -7134,6 +7174,23 @@ static std::vector> make_test_cases_eval() { } } } + // ---- SPARSEK_ATTN -------------------------------------------------- + for (int64_t d_qk : {64, 128}) { + for (int64_t d_v : {64, 128}) { + for (int64_t n_head : {4, 8}) { + for (int64_t kv : {113, 512}) { + for (int64_t b : {1, 4}) { + for (int32_t k_top : {16, 32}) { + for (int32_t win_local : {32, 64}) { + test_cases.emplace_back(new test_sparsek_attn( + d_qk, d_v, n_head, kv, b, k_top, win_local, /*stride_global*/128)); + } + } + } + } + } + } + } test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, { 10, 5, 4, 3})); test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, {30000, 1, 1, 1})); From a5daf2fede36aa1581a429bd070aa0a1206edea2 Mon Sep 17 00:00:00 2001 From: yael-works Date: Wed, 29 Oct 2025 12:47:25 +0200 Subject: [PATCH 03/15] fix: add missing prototypes for ggml_sparsek_attn_set/get_params in ggml.h Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/include/ggml.h | 10 +++++++ ggml/tests/test_sparsek_cpu.c | 40 +++++++++++++++++++++++++++ tests/test_sparsek_cpu.c | 50 ++++++++++++++++++++++++++++++++++ tmp-test/test_sparsek_cpu.c | 51 +++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 ggml/tests/test_sparsek_cpu.c create mode 100644 tests/test_sparsek_cpu.c create mode 100644 tmp-test/test_sparsek_cpu.c diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c47c5404c9c3b..25c8343fc3315 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ 
-2241,6 +2241,16 @@ extern "C" {
             int32_t               win_local,
             int32_t               stride_global);

+    GGML_API void ggml_sparsek_attn_set_params(
+            struct ggml_tensor * a,
+            int32_t k_top,
+            int32_t win_local,
+            int32_t stride_global);
+
+    GGML_API int32_t ggml_sparsek_attn_get_param(
+            const struct ggml_tensor * a,
+            int index);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
diff --git a/ggml/tests/test_sparsek_cpu.c b/ggml/tests/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..9cc82681d9356
--- /dev/null
+++ b/ggml/tests/test_sparsek_cpu.c
@@ -0,0 +1,40 @@
+#include "ggml.h"
+#include <stdio.h>
+#include <string.h>
+
+int main() {
+    struct ggml_init_params params = {
+        .mem_size   = 16*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+    struct ggml_context *ctx = ggml_init(params);
+
+    // create small tensors for the test
+    int n = 2;
+    float q_data[4] = {1.0, 2.0, 3.0, 4.0};
+    float k_data[4] = {1.0, 0.0, 0.0, 1.0};
+    float v_data[4] = {5.0, 6.0, 7.0, 8.0};
+
+    struct ggml_tensor *Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor *K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor *V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+
+    memcpy(Q->data, q_data, sizeof(q_data));
+    memcpy(K->data, k_data, sizeof(k_data));
+    memcpy(V->data, v_data, sizeof(v_data));
+
+    printf("Running ggml_sparsek_attn CPU test...\n");
+    struct ggml_tensor *Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0);
+
+    ggml_build_forward_expand(NULL, Y);
+    ggml_graph_compute_with_ctx(ctx, NULL, 1);
+
+    printf("Output tensor:\n");
+    for (int i = 0; i < n*n; ++i)
+        printf("%.6f ", ((float*)Y->data)[i]);
+    printf("\n");
+
+    ggml_free(ctx);
+    return 0;
+}
diff --git a/tests/test_sparsek_cpu.c b/tests/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..0f6c082ed2f31
--- /dev/null
+++ b/tests/test_sparsek_cpu.c
@@ -0,0 +1,50 @@
+#define GGML_USE_DEFAULT_BACKEND 1
+#include "ggml.h"
+#include <stdio.h>
+#include
+#include <string.h> // memcpy
+
+// forward declaration of the legacy ggml compute function
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads);
+
+int main() {
+    struct ggml_init_params params = {
+        .mem_size   = 16 * 1024 * 1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // create small tensors for the test
+    int n = 2;
+    float q_data[4] = {1.0, 2.0, 3.0, 4.0};
+    float k_data[4] = {1.0, 0.0, 0.0, 1.0};
+    float v_data[4] = {5.0, 6.0, 7.0, 8.0};
+
+    struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+
+    memcpy(Q->data, q_data, sizeof(q_data));
+    memcpy(K->data, k_data, sizeof(k_data));
+    memcpy(V->data, v_data, sizeof(v_data));
+
+    printf("Running ggml_sparsek_attn CPU test...\n");
+
+    struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0);
+
+    ggml_build_forward_expand(NULL, Y);
+    ggml_graph_compute(ctx, Y, 1);
+
+    printf("SPARSEK CPU test finished successfully.\n");
+    printf("Output tensor:\n");
+
+    for (int i = 0; i < n * n; ++i) {
+        printf("%.6f ", ((float *)Y->data)[i]);
+    }
+    printf("\n");
+
+    ggml_free(ctx);
+    return 0;
+}
diff --git a/tmp-test/test_sparsek_cpu.c b/tmp-test/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..8358e01f9d612
--- /dev/null
+++ b/tmp-test/test_sparsek_cpu.c
@@ -0,0 +1,51 @@
+#include "ggml.h"
+#include <stdio.h>
+#include
+#include <string.h> // for memcpy
+
+// forward declaration of the legacy compute function
+void 
ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); + +int main() { + struct ggml_init_params params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + struct ggml_context * ctx = ggml_init(params); + + // טנזורים קטנים לבדיקה + int n = 2; + float q_data[4] = {1.0, 2.0, 3.0, 4.0}; + float k_data[4] = {1.0, 0.0, 0.0, 1.0}; + float v_data[4] = {5.0, 6.0, 7.0, 8.0}; + + struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + + memcpy(Q->data, q_data, sizeof(q_data)); + memcpy(K->data, k_data, sizeof(k_data)); + memcpy(V->data, v_data, sizeof(v_data)); + + printf("Running ggml_sparsek_attn CPU test...\n"); + + // קריאה לפונקציה שלך + struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); + + // חישוב + ggml_build_forward_expand(NULL, Y); + ggml_graph_compute(ctx, Y, 1); + + printf("SPARSEK CPU test finished successfully.\n"); + printf("Output tensor:\n"); + + for (int i = 0; i < n * n; ++i) { + printf("%.6f ", ((float *)Y->data)[i]); + } + printf("\n"); + + ggml_free(ctx); + return 0; +} From 39a117f95a189e369669c2b8d9d9a07ca24902e9 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 09:39:13 +0200 Subject: [PATCH 04/15] fix SparseK CPU operator implementation Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/src/ggml-cpu/ggml-cpu.c | 3 +- ggml/src/ggml-cpu/ops.cpp | 121 ++++++++++++++++++++++------------ ggml/tests/test_sparsek_cpu.c | 40 ----------- tests/test_sparsek_cpu.c | 50 -------------- tmp-test/test_sparsek_cpu.c | 51 -------------- 5 files changed, 81 insertions(+), 184 deletions(-) delete mode 100644 ggml/tests/test_sparsek_cpu.c delete mode 100644 tests/test_sparsek_cpu.c delete mode 100644 tmp-test/test_sparsek_cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b43a2b437d8dc..275d1a22fd381 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1955,8 +1955,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_OP_SPARSEK_ATTN: { ggml_compute_forward_sparsek_attn(params, tensor); - break; - } + } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 5bc0cb3e298c7..788b5e8954f75 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -9,6 +9,7 @@ #include #include +#include // ggml_compute_forward_dup @@ -7915,7 +7916,8 @@ static void ggml_compute_forward_sparsek_attn_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - if (params->ith != 0) return; // main thread only + // Single-threaded baseline version (expand later for parallelism) + if (params->ith != 0) return; const struct ggml_tensor * Q = dst->src[0]; const struct ggml_tensor * K = dst->src[1]; @@ -7925,56 +7927,87 @@ static void ggml_compute_forward_sparsek_attn_f32( GGML_ASSERT(Q->type == GGML_TYPE_F32); GGML_ASSERT(K->type == GGML_TYPE_F32); GGML_ASSERT(V->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); const int32_t k_top = ggml_get_op_params_i32(dst, 0); const int32_t win_local = ggml_get_op_params_i32(dst, 1); const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); + GGML_UNUSED(win_local); + GGML_UNUSED(stride_glb); - const int64_t D = Q->ne[0]; // embedding dim - const 
int64_t T = Q->ne[1]; // sequence length + // Tensor dimensions according to GGML layout: ne[0]=d, ne[1]=seq, ne[2]=head, ne[3]=batch + const int64_t D = Q->ne[0]; + const int64_t T = Q->ne[1]; + const int64_t H = Q->ne[2]; + const int64_t B = Q->ne[3]; - const float * q = (const float *) Q->data; - const float * k = (const float *) K->data; - const float * v = (const float *) V->data; - float * out = (float *) dst->data; + // Temporary buffer for attention scores for one query row + std::vector attn_row(T, 0.0f); - - for (int64_t i = 0; i < T; ++i) { - for (int64_t j = 0; j < T; ++j) { - float dot = 0.0f; - for (int64_t d = 0; d < D; ++d) - dot += q[i*D + d] * k[j*D + d]; - out[i*T + j] = dot / sqrtf((float) D); - } - } + const float scale = 1.0f / sqrtf((float) D); - for (int64_t i = 0; i < T; ++i) { - float * row = &out[i*T]; - for (int64_t j = 0; j < T; ++j) - if (row[j] < row[k_top]) row[j] = -INFINITY; - } + // Loops over batch, head, and query token + for (int64_t b = 0; b < B; ++b) { + for (int64_t h = 0; h < H; ++h) { + for (int64_t iq = 0; iq < T; ++iq) { - for (int64_t i = 0; i < T; ++i) { - float maxv = -INFINITY; - for (int64_t j = 0; j < T; ++j) - if (out[i*T + j] > maxv) maxv = out[i*T + j]; - float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) { - out[i*T + j] = expf(out[i*T + j] - maxv); - sum += out[i*T + j]; - } - for (int64_t j = 0; j < T; ++j) - out[i*T + j] /= sum; - } + // (1) Compute dot products Q·K within same (b,h) + const char * qbase = (const char *) Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1]; + const float * qv = (const float *) qbase; + for (int64_t j = 0; j < T; ++j) { + const char * kbase = (const char *) K->data + b*K->nb[3] + h*K->nb[2] + j*K->nb[1]; + const float * kv = (const float *) kbase; - float * result = (float *) dst->data; - for (int64_t i = 0; i < T; ++i) { - for (int64_t d = 0; d < D; ++d) { - float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) - sum += out[i*T + j] * v[j*D + d]; - result[i*D + d] = sum; + float dot = 0.0f; + for (int64_t d = 0; d < D; ++d) { + dot += qv[d] * kv[d]; + } + attn_row[j] = dot * scale; + } + + // (2) Select top-k threshold using nth_element + const int kk = std::max(1, std::min((int)T, k_top)); + std::vector tmp(attn_row.begin(), attn_row.end()); + std::nth_element(tmp.begin(), tmp.begin() + (kk - 1), tmp.end(), std::greater()); + const float thr = tmp[kk - 1]; + + for (int64_t j = 0; j < T; ++j) { + if (attn_row[j] < thr) attn_row[j] = -INFINITY; + } + + // (3) Numerically stable softmax on the masked row + float maxv = -INFINITY; + for (int64_t j = 0; j < T; ++j) { + maxv = std::max(maxv, attn_row[j]); + } + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) { + float v = attn_row[j] - maxv; + float e = expf(v); + attn_row[j] = e; + sum += e; + } + const float inv_sum = sum > 0.0f ? 
1.0f / sum : 0.0f; + for (int64_t j = 0; j < T; ++j) { + attn_row[j] *= inv_sum; + } + + // (4) Compute output = A·V (weighted sum) + float * y = (float *) ((char *) dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + + for (int64_t d = 0; d < D; ++d) { + float acc = 0.0f; + for (int64_t j = 0; j < T; ++j) { + const float aij = attn_row[j]; + if (aij == 0.0f) continue; // skip masked entries + const char * vbase = (const char *) V->data + b*V->nb[3] + h*V->nb[2] + j*V->nb[1]; + const float * vv = (const float *) vbase; + acc += aij * vv[d]; + } + y[d] = acc; + } + } } } @@ -7985,7 +8018,13 @@ static void ggml_compute_forward_sparsek_attn_f32( void ggml_compute_forward_sparsek_attn( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - ggml_compute_forward_sparsek_attn_f32(params, dst); + switch (dst->type) { + case GGML_TYPE_F32: + ggml_compute_forward_sparsek_attn_f32(params, dst); + break; + default: + GGML_ASSERT(false && "sparsek_attn: unsupported dst type"); + } } diff --git a/ggml/tests/test_sparsek_cpu.c b/ggml/tests/test_sparsek_cpu.c deleted file mode 100644 index 9cc82681d9356..0000000000000 --- a/ggml/tests/test_sparsek_cpu.c +++ /dev/null @@ -1,40 +0,0 @@ -#include "ggml.h" -#include -#include - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - struct ggml_context *ctx = ggml_init(params); - - // יצירת טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor *Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor *K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor *V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - struct ggml_tensor *Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); - - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute_with_ctx(ctx, NULL, 1); - - printf("Output tensor:\n"); - for (int i = 0; i < n*n; ++i) - printf("%.6f ", ((float*)Y->data)[i]); - printf("\n"); - - ggml_free(ctx); - return 0; -} diff --git a/tests/test_sparsek_cpu.c b/tests/test_sparsek_cpu.c deleted file mode 100644 index 0f6c082ed2f31..0000000000000 --- a/tests/test_sparsek_cpu.c +++ /dev/null @@ -1,50 +0,0 @@ -#define GGML_USE_DEFAULT_BACKEND 1 -#include "ggml.h" -#include -#include -#include // memcpy - -// הצהרה קדמית לפונקציה הישנה של ggml -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); - -int main() { - struct ggml_init_params params = { - .mem_size = 16 * 1024 * 1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - struct ggml_context * ctx = ggml_init(params); - - // ניצור טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - - struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, 
K, V, 1, 0, 0); - - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute(ctx, Y, 1); - - printf("SPARSEK CPU test finished successfully.\n"); - printf("Output tensor:\n"); - - for (int i = 0; i < n * n; ++i) { - printf("%.6f ", ((float *)Y->data)[i]); - } - printf("\n"); - - ggml_free(ctx); - return 0; -} diff --git a/tmp-test/test_sparsek_cpu.c b/tmp-test/test_sparsek_cpu.c deleted file mode 100644 index 8358e01f9d612..0000000000000 --- a/tmp-test/test_sparsek_cpu.c +++ /dev/null @@ -1,51 +0,0 @@ -#include "ggml.h" -#include -#include -#include // בשביל memcpy - -// הצהרה קדמית לפונקציה הישנה -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - struct ggml_context * ctx = ggml_init(params); - - // טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - - // קריאה לפונקציה שלך - struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); - - // חישוב - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute(ctx, Y, 1); - - printf("SPARSEK CPU test finished successfully.\n"); - printf("Output tensor:\n"); - - for (int i = 0; i < n * n; ++i) { - printf("%.6f ", ((float *)Y->data)[i]); - } - printf("\n"); - - ggml_free(ctx); - return 0; -} From 612fdca9094dc0e9d161499deae3b84ff26be8e0 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 10:35:16 +0200 Subject: [PATCH 05/15] fix SparseK CPU operator implementation Co-authored-by: Yael Co-authored-by: Tamar --- ggml/src/ggml-cpu/ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 788b5e8954f75..8465f4553bd3b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7916,7 +7916,7 @@ static void ggml_compute_forward_sparsek_attn_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - // Single-threaded baseline version (expand later for parallelism) + // Single-threaded baseline version if (params->ith != 0) return; const struct ggml_tensor * Q = dst->src[0]; From b0194f4235d830b200fe19e1b9a69935559dfad2 Mon Sep 17 00:00:00 2001 From: yael-works Date: Thu, 30 Oct 2025 10:37:56 +0200 Subject: [PATCH 06/15] trigger refresh From d02d937d1fbdc8f77c7d8f7f107daf302e5ac7e5 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 12:48:03 +0200 Subject: [PATCH 07/15] test commit from Gitty --- gitty_test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 gitty_test.txt diff --git a/gitty_test.txt b/gitty_test.txt new file mode 100644 index 0000000000000..bb89e2762756f --- /dev/null +++ b/gitty_test.txt @@ -0,0 +1 @@ +# test from Gitty From 5fa78a2b1fa2e8e9c63ec556ec6b093fb105ba77 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 12:49:20 +0200 Subject: [PATCH 08/15] remove test file --- gitty_test.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 gitty_test.txt diff --git 
a/gitty_test.txt b/gitty_test.txt deleted file mode 100644 index bb89e2762756f..0000000000000 --- a/gitty_test.txt +++ /dev/null @@ -1 +0,0 @@ -# test from Gitty From b19c244036b22b093c12ec31a6090a31a31d8fe4 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 13:35:08 +0200 Subject: [PATCH 09/15] feat: implement SparseK attention core logic Co-authored-by: Yael Co-authored-by: Gitty --- ggml/include/ggml.h | 6 +++--- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- tests/test-backend-ops.cpp | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 25c8343fc3315..ad24f341bdd5d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2234,9 +2234,9 @@ extern "C" { GGML_API struct ggml_tensor * ggml_sparsek_attn( struct ggml_context * ctx, - struct ggml_tensor * Q, - struct ggml_tensor * K, - struct ggml_tensor * V, + struct ggml_tensor * Q, + struct ggml_tensor * K, + struct ggml_tensor * V, int32_t k_top, int32_t win_local, int32_t stride_global); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 275d1a22fd381..3fa954e1c324a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1955,7 +1955,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_OP_SPARSEK_ATTN: { ggml_compute_forward_sparsek_attn(params, tensor); - } break; + } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e899bb8c50168..fd78d4e06d2c7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7251,7 +7251,6 @@ static std::vector> make_test_cases_eval() { // Test cases for performance evaluation: should be representative of real-world use cases static std::vector> make_test_cases_perf() { std::vector> test_cases; - // Conv2d: K=CRS=NPQ=4096 matmul performance uint32_t iwh_idx = 0; uint32_t kwh_idx = 1; From 49c7e4b1947242198bfd2167ef960f23d7757082 Mon Sep 17 00:00:00 2001 From: yael-works Date: Thu, 30 Oct 2025 14:14:51 +0200 Subject: [PATCH 10/15] Implement final optimized SparseK Attention (CPU) Co-authored-by: Yael Co-authored-by: Gitty --- ggml/src/ggml-cpu/ops.cpp | 152 +++++++++++++++++++++++++++----------- 1 file changed, 110 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 8465f4553bd3b..ce6169de3fb9b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7909,14 +7909,30 @@ void ggml_compute_forward_argsort( } //------------------------------------------------------------------------------ -// SparseK Attention (CPU) +// SparseK Attention (CPU, final optimized version) //------------------------------------------------------------------------------ +// +// Implements SparseK Attention as a GGML operator for the CPU backend. 
+// Features:
+//   • Top-K filtering using nth_element (O(N))
+//   • Optional local window (win_local)
+//   • Optional global stride (stride_glb)
+//   • Numerically stable softmax
+//   • Preallocated buffers for performance
+//
+// Author: Yael Shuker (yael-works)
+//------------------------------------------------------------------------------
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>

 static void ggml_compute_forward_sparsek_attn_f32(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst) {

-    // Single-threaded baseline version 
+    // Single-threaded baseline version
     if (params->ith != 0) return;

     const struct ggml_tensor * Q = dst->src[0];
@@ -7929,80 +7945,132 @@ static void ggml_compute_forward_sparsek_attn_f32(
     GGML_ASSERT(V->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

+    // Operator parameters
     const int32_t k_top = ggml_get_op_params_i32(dst, 0);
-    const int32_t win_local = ggml_get_op_params_i32(dst, 1);
-    const int32_t stride_glb = ggml_get_op_params_i32(dst, 2);
-    GGML_UNUSED(win_local);
-    GGML_UNUSED(stride_glb);
+    const int32_t win_local = ggml_get_op_params_i32(dst, 1); // -1 ⇒ no local window
+    const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); // ≤1 ⇒ no global stride
+
+    const bool use_local = (win_local >= 0);
+    const bool use_stride = (stride_glb > 1);

-    // Tensor dimensions according to GGML layout: ne[0]=d, ne[1]=seq, ne[2]=head, ne[3]=batch
+    // GGML tensor dimensions: ne[0]=D, ne[1]=T, ne[2]=H, ne[3]=B
     const int64_t D = Q->ne[0];
     const int64_t T = Q->ne[1];
     const int64_t H = Q->ne[2];
     const int64_t B = Q->ne[3];

-    // Temporary buffer for attention scores for one query row
-    std::vector<float> attn_row(T, 0.0f);
+    // Dimension validation
+    GGML_ASSERT(K->ne[0] == D && V->ne[0] == D);
+    GGML_ASSERT(K->ne[1] == T && V->ne[1] == T);
+    GGML_ASSERT(K->ne[2] == H && V->ne[2] == H);
+    GGML_ASSERT(K->ne[3] == B && V->ne[3] == B);
+
+    // Parameter sanity checks
+    GGML_ASSERT(k_top >= 0 && k_top <= (int32_t)T);
+    GGML_ASSERT(win_local >= -1);
+    GGML_ASSERT(stride_glb >= 0);

-    const float scale = 1.0f / sqrtf((float) D);
+    const float scale = 1.0f / sqrtf((float)D);
+    const float NINF = -std::numeric_limits<float>::infinity();
+
+    // Preallocated buffers to avoid heap churn
+    std::vector<float> attn_row((size_t)T, NINF);
+    std::vector<int32_t> cand_idx; cand_idx.reserve((size_t)T);
+    std::vector<float> scores; scores.reserve((size_t)T);

-    // Loops over batch, head, and query token
     for (int64_t b = 0; b < B; ++b) {
         for (int64_t h = 0; h < H; ++h) {
             for (int64_t iq = 0; iq < T; ++iq) {

-                // (1) Compute dot products Q·K within same (b,h)
-                const char * qbase = (const char *) Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1];
-                const float * qv = (const float *) qbase;
+                // (0) Build candidate index list (always include self)
+                cand_idx.clear();
+                scores.clear();
+
+                if (!use_local && !use_stride) {
+                    // No sparsity: attend to all tokens
+                    for (int64_t j = 0; j < T; ++j)
+                        cand_idx.push_back((int32_t)j);
+                } else {
+                    // Apply local window and/or global stride
+                    for (int64_t j = 0; j < T; ++j) {
+                        const int64_t dist = iq >= j ? 
iq - j : j - iq; + const bool pass_local = use_local && (dist <= (int64_t)win_local); + const bool pass_stride = use_stride && (stride_glb > 0 && j % stride_glb == 0); + if (pass_local || pass_stride || j == iq) + cand_idx.push_back((int32_t)j); + } + } + + // Edge case: no candidates or k_top==0 → output zeros + if (k_top == 0 || cand_idx.empty()) { + float * y0 = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + std::fill(y0, y0 + D, 0.0f); + continue; + } - for (int64_t j = 0; j < T; ++j) { - const char * kbase = (const char *) K->data + b*K->nb[3] + h*K->nb[2] + j*K->nb[1]; - const float * kv = (const float *) kbase; + // (1) Compute scaled dot-product Q·K only for candidates + std::fill(attn_row.begin(), attn_row.end(), NINF); + const float * qv = (const float *)((const char *)Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1]); + for (int32_t j : cand_idx) { + const float * kv = (const float *)((const char *)K->data + b*K->nb[3] + h*K->nb[2] + (int64_t)j*K->nb[1]); float dot = 0.0f; - for (int64_t d = 0; d < D; ++d) { + for (int64_t d = 0; d < D; ++d) dot += qv[d] * kv[d]; - } attn_row[j] = dot * scale; } - // (2) Select top-k threshold using nth_element - const int kk = std::max(1, std::min((int)T, k_top)); - std::vector tmp(attn_row.begin(), attn_row.end()); - std::nth_element(tmp.begin(), tmp.begin() + (kk - 1), tmp.end(), std::greater()); - const float thr = tmp[kk - 1]; + // (2) Determine true Top-K threshold using nth_element + const int num_candidates = (int)cand_idx.size(); + const int kk = std::min(std::max(1, k_top), num_candidates); + + if (kk < num_candidates) { + scores.resize((size_t)num_candidates); + for (size_t i = 0; i < cand_idx.size(); ++i) + scores[i] = attn_row[cand_idx[i]]; + + std::nth_element(scores.begin(), scores.begin() + (kk - 1), scores.end(), std::greater()); + const float thr = scores[kk - 1]; - for (int64_t j = 0; j < T; ++j) { - if (attn_row[j] < thr) attn_row[j] = -INFINITY; + // Mask all values below the threshold + for (int32_t j : cand_idx) + if (attn_row[j] < thr) attn_row[j] = NINF; } - // (3) Numerically stable softmax on the masked row - float maxv = -INFINITY; - for (int64_t j = 0; j < T; ++j) { + // (3) Numerically stable softmax + float maxv = NINF; + for (int32_t j : cand_idx) maxv = std::max(maxv, attn_row[j]); + + // Handle all-masked rows + if (!std::isfinite(maxv)) { + float * y0 = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + std::fill(y0, y0 + D, 0.0f); + continue; } + float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) { - float v = attn_row[j] - maxv; - float e = expf(v); + for (int32_t j : cand_idx) { + if (attn_row[j] == NINF) continue; + const float e = expf(attn_row[j] - maxv); attn_row[j] = e; sum += e; } - const float inv_sum = sum > 0.0f ? 1.0f / sum : 0.0f; - for (int64_t j = 0; j < T; ++j) { + + const float inv_sum = (sum > 0.0f) ? 
(1.0f / sum) : 0.0f; + for (int32_t j : cand_idx) { + if (attn_row[j] == NINF) continue; attn_row[j] *= inv_sum; } - // (4) Compute output = A·V (weighted sum) - float * y = (float *) ((char *) dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); - + // (4) Compute output y = A·V + float * y = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); for (int64_t d = 0; d < D; ++d) { float acc = 0.0f; - for (int64_t j = 0; j < T; ++j) { + for (int32_t j : cand_idx) { const float aij = attn_row[j]; - if (aij == 0.0f) continue; // skip masked entries - const char * vbase = (const char *) V->data + b*V->nb[3] + h*V->nb[2] + j*V->nb[1]; - const float * vv = (const float *) vbase; + if (!(aij > 0.0f)) continue; // skip zero or masked + const float * vv = (const float *)((const char *)V->data + b*V->nb[3] + h*V->nb[2] + (int64_t)j*V->nb[1]); acc += aij * vv[d]; } y[d] = acc; @@ -8012,7 +8080,7 @@ static void ggml_compute_forward_sparsek_attn_f32( } GGML_PRINT_DEBUG("[SPARSEK CPU] k_top=%d win_local=%d stride=%d\n", - k_top, win_local, stride_glb); + k_top, win_local, stride_glb); } void ggml_compute_forward_sparsek_attn( From 939bbd9d7166b1eebf68edb09c4e1e84bac3c486 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 17:44:20 +0200 Subject: [PATCH 11/15] style: remove trailing whitespace and fix indentation in test-backend-ops.cpp Co-authored-by: Gitty Burstein Co-authored-by: Yael Shuker --- ggml/src/ggml-cpu/ops.cpp | 2 +- tests/test-backend-ops.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index ce6169de3fb9b..762d340761d03 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7920,7 +7920,7 @@ void ggml_compute_forward_argsort( // • Numerically stable softmax // • Preallocated buffers for performance // -// Author: Yael Shuker (yael-works) +// Author: Yael Shuker & Gitty Burstein //------------------------------------------------------------------------------ #include diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fd78d4e06d2c7..8da7c730e3f74 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7208,7 +7208,7 @@ static std::vector> make_test_cases_eval() { continue; } for (bool with_bias : {false, true}) { - if (!with_gate && !with_bias) { + if (!with_gate && !with_bias) { continue; } for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { @@ -7252,12 +7252,12 @@ static std::vector> make_test_cases_eval() { static std::vector> make_test_cases_perf() { std::vector> test_cases; // Conv2d: K=CRS=NPQ=4096 matmul performance - uint32_t iwh_idx = 0; - uint32_t kwh_idx = 1; - uint32_t Cout_idx = 2; - uint32_t Cin_idx = 3; - uint32_t B_idx = 4; - std::vector> cases = { + uint32_t iwh_idx = 0; + uint32_t kwh_idx = 1; + uint32_t Cout_idx = 2; + uint32_t Cin_idx = 3; + uint32_t B_idx = 4; + std::vector> cases = { //{IWH, KWH, Cout, Cin, B} // K=CRS=NPQ=4096 conv2d matmul performance {19, 4, 4096, 256, 16}, From 1983ab3e0464ee34f0d0e844105b6ca8be92e0f3 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Fri, 31 Oct 2025 01:37:25 +0200 Subject: [PATCH 12/15] delete Trailing whitespace Co-authored-by: Gitty Burstein Co-authored-by: Yael Shuker --- tests/test-backend-ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8da7c730e3f74..eb69f2e669514 100644 --- a/tests/test-backend-ops.cpp +++ 
b/tests/test-backend-ops.cpp @@ -7258,8 +7258,8 @@ static std::vector> make_test_cases_perf() { uint32_t Cin_idx = 3; uint32_t B_idx = 4; std::vector> cases = { - //{IWH, KWH, Cout, Cin, B} - // K=CRS=NPQ=4096 conv2d matmul performance +// {IWH, KWH, Cout, Cin, B} +// K=CRS=NPQ=4096 conv2d matmul performance {19, 4, 4096, 256, 16}, // K=128, CRS=128, NPQ=4096 { 19, 4, 128, 8, 16}, From 202c5d14115eee43c2f575db615b56b02ae1c2d3 Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:19 +0200 Subject: [PATCH 13/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index eb69f2e669514..839b29c44143d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7208,7 +7208,7 @@ static std::vector> make_test_cases_eval() { continue; } for (bool with_bias : {false, true}) { - if (!with_gate && !with_bias) { + if (!with_gate && !with_bias) { continue; } for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { From 971296774c2c613188380d4264fa7f2e08e7f46e Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:30 +0200 Subject: [PATCH 14/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 839b29c44143d..937576023c956 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7135,7 +7135,7 @@ static std::vector> make_test_cases_eval() { if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA - + for (bool mask : { true, false } ) { for (bool sinks : { true, false } ) { for (float max_bias : { 0.0f, 8.0f }) { From 77f4088b1220bd45a65668c0f0f2c35055c3cad5 Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:42 +0200 Subject: [PATCH 15/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 937576023c956..5350ea13e6ee6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7178,7 +7178,7 @@ static std::vector> make_test_cases_eval() { for (int64_t d_qk : {64, 128}) { for (int64_t d_v : {64, 128}) { for (int64_t n_head : {4, 8}) { - for (int64_t kv : {113, 512}) { + for (int64_t kv : {113, 512}) { for (int64_t b : {1, 4}) { for (int32_t k_top : {16, 32}) { for (int32_t win_local : {32, 64}) {