From 66248d207c730525b33b39f2577cb2462804bc56 Mon Sep 17 00:00:00 2001
From: yael-works
Date: Tue, 28 Oct 2025 11:25:39 +0200
Subject: [PATCH 01/15] Add skeleton for GGML_OP_SPARSEK_ATTN (SparseK
 Attention): new operator definition and tensor creation in ggml.c/h;
 backend implementation pending

Co-authored-by: Yael Shuker
Co-authored-by: Gitty Burstein
---
 ggml/include/ggml.h | 12 +++++++++++-
 ggml/src/ggml.c     | 46 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d948b00cc7f30..c47c5404c9c3b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -529,7 +529,7 @@ extern "C" {
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
-
+        GGML_OP_SPARSEK_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -2231,6 +2231,16 @@ extern "C" {
     // n_head % ne32 == 0
     // ne3 % ne33 == 0
     //
+
+    GGML_API struct ggml_tensor * ggml_sparsek_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * Q,
+            struct ggml_tensor  * K,
+            struct ggml_tensor  * V,
+            int32_t               k_top,
+            int32_t               win_local,
+            int32_t               stride_global);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be8456..6aec78051a3c8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1019,7 +1019,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };

-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1094,7 +1094,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
-
+    "sparsek_attn(Q, K, V, k_top, win_local, stride_global)",
     "flash_attn_ext(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -1123,7 +1123,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -5063,6 +5063,46 @@ struct ggml_tensor * ggml_top_k(
     return result;
 }

+// ggml_sparsek_attn
+struct ggml_tensor * ggml_sparsek_attn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * Q,
+        struct ggml_tensor  * K,
+        struct ggml_tensor  * V,
+        int32_t               k_top,
+        int32_t               win_local,
+        int32_t               stride_global) {
+
+    // suppress unused-parameter warnings (the parameters are not used yet)
+    GGML_UNUSED(k_top);
+    GGML_UNUSED(win_local);
+    GGML_UNUSED(stride_global);
+
+    // basic validity checks
+    GGML_ASSERT(Q != NULL);
+    GGML_ASSERT(K != NULL);
+    GGML_ASSERT(V != NULL);
+    GGML_ASSERT(ggml_can_mul_mat(K, Q));
+
+    // create the output tensor with the appropriate dimensions
+    int64_t ne[GGML_MAX_DIMS] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne);
+
+    // set the operator type and its sources
+    result->op     = GGML_OP_SPARSEK_ATTN;
+    result->src[0] = Q;
+    result->src[1] = K;
+    result->src[2] = V;
+
+    // store the numeric parameters in op_params (the convention used in ggml)
+    result->op_params[0] = k_top;
+    result->op_params[1] = win_local;
+    result->op_params[2] = stride_global;
+
+    return result;
+}
+
+
 // ggml_flash_attn_ext

 struct ggml_tensor * ggml_flash_attn_ext(

From 5d6d3b771a47bee4a316e5a197cb99b6af4131a0 Mon Sep 17 00:00:00 2001
From: yael-works
Date: Tue, 28 Oct 2025 14:06:10 +0200
Subject: [PATCH 02/15] Add CPU
support for SparseK Attention (without performance checks) Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ ggml/src/ggml-cpu/ops.cpp | 82 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 2 + ggml/src/ggml.c | 46 +++++++++++--------- tests/test-backend-ops.cpp | 61 ++++++++++++++++++++++++++- 5 files changed, 174 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 9ec485cfa2ff7..b43a2b437d8dc 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1952,6 +1952,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_attn_ext(params, tensor); } break; + case GGML_OP_SPARSEK_ATTN: + { + ggml_compute_forward_sparsek_attn(params, tensor); + break; + } case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3156bd60101d7..5bc0cb3e298c7 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7907,6 +7907,88 @@ void ggml_compute_forward_argsort( } } +//------------------------------------------------------------------------------ +// SparseK Attention (CPU) +//------------------------------------------------------------------------------ + +static void ggml_compute_forward_sparsek_attn_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + if (params->ith != 0) return; // main thread only + + const struct ggml_tensor * Q = dst->src[0]; + const struct ggml_tensor * K = dst->src[1]; + const struct ggml_tensor * V = dst->src[2]; + + GGML_ASSERT(Q && K && V); + GGML_ASSERT(Q->type == GGML_TYPE_F32); + GGML_ASSERT(K->type == GGML_TYPE_F32); + GGML_ASSERT(V->type == GGML_TYPE_F32); + + const int32_t k_top = ggml_get_op_params_i32(dst, 0); + const int32_t win_local = ggml_get_op_params_i32(dst, 1); + const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); + + const int64_t D = Q->ne[0]; // embedding dim + const int64_t T = Q->ne[1]; // sequence length + + const float * q = (const float *) Q->data; + const float * k = (const float *) K->data; + const float * v = (const float *) V->data; + float * out = (float *) dst->data; + + + for (int64_t i = 0; i < T; ++i) { + for (int64_t j = 0; j < T; ++j) { + float dot = 0.0f; + for (int64_t d = 0; d < D; ++d) + dot += q[i*D + d] * k[j*D + d]; + out[i*T + j] = dot / sqrtf((float) D); + } + } + + for (int64_t i = 0; i < T; ++i) { + float * row = &out[i*T]; + for (int64_t j = 0; j < T; ++j) + if (row[j] < row[k_top]) row[j] = -INFINITY; + } + + for (int64_t i = 0; i < T; ++i) { + float maxv = -INFINITY; + for (int64_t j = 0; j < T; ++j) + if (out[i*T + j] > maxv) maxv = out[i*T + j]; + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) { + out[i*T + j] = expf(out[i*T + j] - maxv); + sum += out[i*T + j]; + } + for (int64_t j = 0; j < T; ++j) + out[i*T + j] /= sum; + } + + + float * result = (float *) dst->data; + for (int64_t i = 0; i < T; ++i) { + for (int64_t d = 0; d < D; ++d) { + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) + sum += out[i*T + j] * v[j*D + d]; + result[i*D + d] = sum; + } + } + + GGML_PRINT_DEBUG("[SPARSEK CPU] k_top=%d win_local=%d stride=%d\n", + k_top, win_local, stride_glb); +} + +void ggml_compute_forward_sparsek_attn( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + ggml_compute_forward_sparsek_attn_f32(params, dst); +} + + // 
ggml_compute_forward_flash_attn_ext static void ggml_compute_forward_flash_attn_ext_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 9824a03b45833..e43b23a5587bd 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -86,6 +86,8 @@ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sparsek_attn(const struct ggml_compute_params * params, struct ggml_tensor * dst); + void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, const bool masked, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6aec78051a3c8..9ad055c994672 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -990,7 +990,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "TIMESTEP_EMBEDDING", "ARGSORT", "LEAKY_RELU", - + "SPARSEK_ATTN", "FLASH_ATTN_EXT", "FLASH_ATTN_BACK", "SSM_CONV", @@ -1094,7 +1094,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", "leaky_relu(x)", - "sparsek_attn(Q, K, V, k_top, win_local, stride_global)", + "sparsek_attn(x)", "flash_attn_ext(x)", "flash_attn_back(x)", "ssm_conv(x)", @@ -5073,36 +5073,42 @@ struct ggml_tensor * ggml_sparsek_attn( int32_t win_local, int32_t stride_global) { - // ביטול אזהרות (אם טרם משתמשים בפרמטרים) - GGML_UNUSED(k_top); - GGML_UNUSED(win_local); - GGML_UNUSED(stride_global); - - // בדיקות תקינות בסיסיות - GGML_ASSERT(Q != NULL); - GGML_ASSERT(K != NULL); - GGML_ASSERT(V != NULL); GGML_ASSERT(ggml_can_mul_mat(K, Q)); + GGML_ASSERT(Q->ne[3] == K->ne[3] && Q->ne[3] == V->ne[3]); + + int64_t ne[4] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + - // יצירת טנזור פלט בממדים המתאימים - int64_t ne[GGML_MAX_DIMS] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne); + int32_t params_i32[3] = { k_top, win_local, stride_global }; + ggml_set_op_params(result, params_i32, sizeof(params_i32)); - // הגדרת סוג האופרטור והמקורות result->op = GGML_OP_SPARSEK_ATTN; result->src[0] = Q; result->src[1] = K; result->src[2] = V; - // שמירת הפרמטרים המספריים במערך op_params (שיטה הנהוגה ב־ggml) - result->op_params[0] = k_top; - result->op_params[1] = win_local; - result->op_params[2] = stride_global; - return result; } +void ggml_sparsek_attn_set_params(struct ggml_tensor * a, + int32_t k_top, + int32_t win_local, + int32_t stride_global) { + GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN); + ggml_set_op_params_i32(a, 0, k_top); + ggml_set_op_params_i32(a, 1, win_local); + ggml_set_op_params_i32(a, 2, stride_global); +} + +int32_t ggml_sparsek_attn_get_param(const struct ggml_tensor * a, int index) { + GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN); + return ggml_get_op_params_i32(a, index); +} + + + // ggml_flash_attn_ext struct ggml_tensor * ggml_flash_attn_ext( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index aee1730137900..e899bb8c50168 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1778,6 +1778,7 @@ struct test_example : public test_case { }; + // GGML_OP_UNARY struct 
test_unary : public test_case { const ggml_unary_op op; @@ -5362,7 +5363,46 @@ struct test_leaky_relu : public test_case { } }; -// GGML_OP_FLASH_ATTN_EXT +// GGML_OP_SPARSEK_ATTN +struct test_sparsek_attn : public test_case { + const int64_t d_qk; + const int64_t d_v; + const int64_t n_head; + const int64_t n_tokens; + const int64_t batch; + const int32_t k_top; + const int32_t win_local; + const int32_t stride_global; + + std::string vars() override { + return VARS_TO_STR9(d_qk, d_v, n_head, n_tokens, batch, k_top, win_local, stride_global, 0); + } + + test_sparsek_attn(int64_t d_qk = 128, int64_t d_v = 128, int64_t n_head = 8, + int64_t n_tokens = 256, int64_t batch = 4, + int32_t k_top = 32, int32_t win_local = 64, int32_t stride_global = 128) + : d_qk(d_qk), d_v(d_v), n_head(n_head), n_tokens(n_tokens), batch(batch), + k_top(k_top), win_local(win_local), stride_global(stride_global) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_q = n_tokens; + ggml_tensor * Q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_qk, n_q, n_head, batch); + ggml_set_name(Q, "Q"); + ggml_tensor * K = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_qk, n_tokens, n_head, batch); + ggml_set_name(K, "K"); + ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_v, n_tokens, n_head, batch); + ggml_set_name(V, "V"); + + ggml_tensor * out = ggml_sparsek_attn(ctx, Q, K, V, k_top, win_local, stride_global); + ggml_set_name(out, "SPARSEK_ATTN_out"); + + return out; + } +}; + + + +// GGML_OP_FLAsH_ATTN_EXT struct test_flash_attn_ext : public test_case { const int64_t hsk; // K head size const int64_t hsv; // V head size @@ -7095,7 +7135,7 @@ static std::vector> make_test_cases_eval() { if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA - + for (bool mask : { true, false } ) { for (bool sinks : { true, false } ) { for (float max_bias : { 0.0f, 8.0f }) { @@ -7134,6 +7174,23 @@ static std::vector> make_test_cases_eval() { } } } + // ---- SPARSEK_ATTN -------------------------------------------------- + for (int64_t d_qk : {64, 128}) { + for (int64_t d_v : {64, 128}) { + for (int64_t n_head : {4, 8}) { + for (int64_t kv : {113, 512}) { + for (int64_t b : {1, 4}) { + for (int32_t k_top : {16, 32}) { + for (int32_t win_local : {32, 64}) { + test_cases.emplace_back(new test_sparsek_attn( + d_qk, d_v, n_head, kv, b, k_top, win_local, /*stride_global*/128)); + } + } + } + } + } + } + } test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, { 10, 5, 4, 3})); test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, {30000, 1, 1, 1})); From a5daf2fede36aa1581a429bd070aa0a1206edea2 Mon Sep 17 00:00:00 2001 From: yael-works Date: Wed, 29 Oct 2025 12:47:25 +0200 Subject: [PATCH 03/15] fix: add missing prototypes for ggml_sparsek_attn_set/get_params in ggml.h Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/include/ggml.h | 10 +++++++ ggml/tests/test_sparsek_cpu.c | 40 +++++++++++++++++++++++++++ tests/test_sparsek_cpu.c | 50 ++++++++++++++++++++++++++++++++++ tmp-test/test_sparsek_cpu.c | 51 +++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 ggml/tests/test_sparsek_cpu.c create mode 100644 tests/test_sparsek_cpu.c create mode 100644 tmp-test/test_sparsek_cpu.c diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c47c5404c9c3b..25c8343fc3315 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ 
-2241,6 +2241,16 @@ extern "C" {
             int32_t               win_local,
             int32_t               stride_global);

+    GGML_API void ggml_sparsek_attn_set_params(
+            struct ggml_tensor * a,
+            int32_t k_top,
+            int32_t win_local,
+            int32_t stride_global);
+
+    GGML_API int32_t ggml_sparsek_attn_get_param(
+            const struct ggml_tensor * a,
+            int index);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
diff --git a/ggml/tests/test_sparsek_cpu.c b/ggml/tests/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..9cc82681d9356
--- /dev/null
+++ b/ggml/tests/test_sparsek_cpu.c
@@ -0,0 +1,40 @@
+#include "ggml.h"
+#include <stdio.h>
+#include <string.h>
+
+int main() {
+    struct ggml_init_params params = {
+        .mem_size   = 16*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+    struct ggml_context *ctx = ggml_init(params);
+
+    // create small tensors for the test
+    int n = 2;
+    float q_data[4] = {1.0, 2.0, 3.0, 4.0};
+    float k_data[4] = {1.0, 0.0, 0.0, 1.0};
+    float v_data[4] = {5.0, 6.0, 7.0, 8.0};
+
+    struct ggml_tensor *Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor *K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor *V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+
+    memcpy(Q->data, q_data, sizeof(q_data));
+    memcpy(K->data, k_data, sizeof(k_data));
+    memcpy(V->data, v_data, sizeof(v_data));
+
+    printf("Running ggml_sparsek_attn CPU test...\n");
+    struct ggml_tensor *Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0);
+
+    ggml_build_forward_expand(NULL, Y);
+    ggml_graph_compute_with_ctx(ctx, NULL, 1);
+
+    printf("Output tensor:\n");
+    for (int i = 0; i < n*n; ++i)
+        printf("%.6f ", ((float*)Y->data)[i]);
+    printf("\n");
+
+    ggml_free(ctx);
+    return 0;
+}
diff --git a/tests/test_sparsek_cpu.c b/tests/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..0f6c082ed2f31
--- /dev/null
+++ b/tests/test_sparsek_cpu.c
@@ -0,0 +1,50 @@
+#define GGML_USE_DEFAULT_BACKEND 1
+#include "ggml.h"
+#include <stdio.h>
+#include
+#include <string.h> // memcpy
+
+// forward declaration of the legacy ggml compute function
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads);
+
+int main() {
+    struct ggml_init_params params = {
+        .mem_size   = 16 * 1024 * 1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // create small tensors for the test
+    int n = 2;
+    float q_data[4] = {1.0, 2.0, 3.0, 4.0};
+    float k_data[4] = {1.0, 0.0, 0.0, 1.0};
+    float v_data[4] = {5.0, 6.0, 7.0, 8.0};
+
+    struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+    struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+
+    memcpy(Q->data, q_data, sizeof(q_data));
+    memcpy(K->data, k_data, sizeof(k_data));
+    memcpy(V->data, v_data, sizeof(v_data));
+
+    printf("Running ggml_sparsek_attn CPU test...\n");
+
+    struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0);
+
+    ggml_build_forward_expand(NULL, Y);
+    ggml_graph_compute(ctx, Y, 1);
+
+    printf("SPARSEK CPU test finished successfully.\n");
+    printf("Output tensor:\n");
+
+    for (int i = 0; i < n * n; ++i) {
+        printf("%.6f ", ((float *)Y->data)[i]);
+    }
+    printf("\n");
+
+    ggml_free(ctx);
+    return 0;
+}
diff --git a/tmp-test/test_sparsek_cpu.c b/tmp-test/test_sparsek_cpu.c
new file mode 100644
index 0000000000000..8358e01f9d612
--- /dev/null
+++ b/tmp-test/test_sparsek_cpu.c
@@ -0,0 +1,51 @@
+#include "ggml.h"
+#include <stdio.h>
+#include
+#include <string.h> // for memcpy
+
+// forward declaration of the legacy compute function
+void 
ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); + +int main() { + struct ggml_init_params params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + struct ggml_context * ctx = ggml_init(params); + + // טנזורים קטנים לבדיקה + int n = 2; + float q_data[4] = {1.0, 2.0, 3.0, 4.0}; + float k_data[4] = {1.0, 0.0, 0.0, 1.0}; + float v_data[4] = {5.0, 6.0, 7.0, 8.0}; + + struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + + memcpy(Q->data, q_data, sizeof(q_data)); + memcpy(K->data, k_data, sizeof(k_data)); + memcpy(V->data, v_data, sizeof(v_data)); + + printf("Running ggml_sparsek_attn CPU test...\n"); + + // קריאה לפונקציה שלך + struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); + + // חישוב + ggml_build_forward_expand(NULL, Y); + ggml_graph_compute(ctx, Y, 1); + + printf("SPARSEK CPU test finished successfully.\n"); + printf("Output tensor:\n"); + + for (int i = 0; i < n * n; ++i) { + printf("%.6f ", ((float *)Y->data)[i]); + } + printf("\n"); + + ggml_free(ctx); + return 0; +} From 39a117f95a189e369669c2b8d9d9a07ca24902e9 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 09:39:13 +0200 Subject: [PATCH 04/15] fix SparseK CPU operator implementation Co-authored-by: Yael Shuker Co-authored-by: Gitty Burstein --- ggml/src/ggml-cpu/ggml-cpu.c | 3 +- ggml/src/ggml-cpu/ops.cpp | 121 ++++++++++++++++++++++------------ ggml/tests/test_sparsek_cpu.c | 40 ----------- tests/test_sparsek_cpu.c | 50 -------------- tmp-test/test_sparsek_cpu.c | 51 -------------- 5 files changed, 81 insertions(+), 184 deletions(-) delete mode 100644 ggml/tests/test_sparsek_cpu.c delete mode 100644 tests/test_sparsek_cpu.c delete mode 100644 tmp-test/test_sparsek_cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b43a2b437d8dc..275d1a22fd381 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1955,8 +1955,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_OP_SPARSEK_ATTN: { ggml_compute_forward_sparsek_attn(params, tensor); - break; - } + } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 5bc0cb3e298c7..788b5e8954f75 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -9,6 +9,7 @@ #include #include +#include // ggml_compute_forward_dup @@ -7915,7 +7916,8 @@ static void ggml_compute_forward_sparsek_attn_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - if (params->ith != 0) return; // main thread only + // Single-threaded baseline version (expand later for parallelism) + if (params->ith != 0) return; const struct ggml_tensor * Q = dst->src[0]; const struct ggml_tensor * K = dst->src[1]; @@ -7925,56 +7927,87 @@ static void ggml_compute_forward_sparsek_attn_f32( GGML_ASSERT(Q->type == GGML_TYPE_F32); GGML_ASSERT(K->type == GGML_TYPE_F32); GGML_ASSERT(V->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); const int32_t k_top = ggml_get_op_params_i32(dst, 0); const int32_t win_local = ggml_get_op_params_i32(dst, 1); const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); + GGML_UNUSED(win_local); + GGML_UNUSED(stride_glb); - const int64_t D = Q->ne[0]; // embedding dim - const 
int64_t T = Q->ne[1]; // sequence length + // Tensor dimensions according to GGML layout: ne[0]=d, ne[1]=seq, ne[2]=head, ne[3]=batch + const int64_t D = Q->ne[0]; + const int64_t T = Q->ne[1]; + const int64_t H = Q->ne[2]; + const int64_t B = Q->ne[3]; - const float * q = (const float *) Q->data; - const float * k = (const float *) K->data; - const float * v = (const float *) V->data; - float * out = (float *) dst->data; + // Temporary buffer for attention scores for one query row + std::vector attn_row(T, 0.0f); - - for (int64_t i = 0; i < T; ++i) { - for (int64_t j = 0; j < T; ++j) { - float dot = 0.0f; - for (int64_t d = 0; d < D; ++d) - dot += q[i*D + d] * k[j*D + d]; - out[i*T + j] = dot / sqrtf((float) D); - } - } + const float scale = 1.0f / sqrtf((float) D); - for (int64_t i = 0; i < T; ++i) { - float * row = &out[i*T]; - for (int64_t j = 0; j < T; ++j) - if (row[j] < row[k_top]) row[j] = -INFINITY; - } + // Loops over batch, head, and query token + for (int64_t b = 0; b < B; ++b) { + for (int64_t h = 0; h < H; ++h) { + for (int64_t iq = 0; iq < T; ++iq) { - for (int64_t i = 0; i < T; ++i) { - float maxv = -INFINITY; - for (int64_t j = 0; j < T; ++j) - if (out[i*T + j] > maxv) maxv = out[i*T + j]; - float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) { - out[i*T + j] = expf(out[i*T + j] - maxv); - sum += out[i*T + j]; - } - for (int64_t j = 0; j < T; ++j) - out[i*T + j] /= sum; - } + // (1) Compute dot products Q·K within same (b,h) + const char * qbase = (const char *) Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1]; + const float * qv = (const float *) qbase; + for (int64_t j = 0; j < T; ++j) { + const char * kbase = (const char *) K->data + b*K->nb[3] + h*K->nb[2] + j*K->nb[1]; + const float * kv = (const float *) kbase; - float * result = (float *) dst->data; - for (int64_t i = 0; i < T; ++i) { - for (int64_t d = 0; d < D; ++d) { - float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) - sum += out[i*T + j] * v[j*D + d]; - result[i*D + d] = sum; + float dot = 0.0f; + for (int64_t d = 0; d < D; ++d) { + dot += qv[d] * kv[d]; + } + attn_row[j] = dot * scale; + } + + // (2) Select top-k threshold using nth_element + const int kk = std::max(1, std::min((int)T, k_top)); + std::vector tmp(attn_row.begin(), attn_row.end()); + std::nth_element(tmp.begin(), tmp.begin() + (kk - 1), tmp.end(), std::greater()); + const float thr = tmp[kk - 1]; + + for (int64_t j = 0; j < T; ++j) { + if (attn_row[j] < thr) attn_row[j] = -INFINITY; + } + + // (3) Numerically stable softmax on the masked row + float maxv = -INFINITY; + for (int64_t j = 0; j < T; ++j) { + maxv = std::max(maxv, attn_row[j]); + } + float sum = 0.0f; + for (int64_t j = 0; j < T; ++j) { + float v = attn_row[j] - maxv; + float e = expf(v); + attn_row[j] = e; + sum += e; + } + const float inv_sum = sum > 0.0f ? 
1.0f / sum : 0.0f; + for (int64_t j = 0; j < T; ++j) { + attn_row[j] *= inv_sum; + } + + // (4) Compute output = A·V (weighted sum) + float * y = (float *) ((char *) dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + + for (int64_t d = 0; d < D; ++d) { + float acc = 0.0f; + for (int64_t j = 0; j < T; ++j) { + const float aij = attn_row[j]; + if (aij == 0.0f) continue; // skip masked entries + const char * vbase = (const char *) V->data + b*V->nb[3] + h*V->nb[2] + j*V->nb[1]; + const float * vv = (const float *) vbase; + acc += aij * vv[d]; + } + y[d] = acc; + } + } } } @@ -7985,7 +8018,13 @@ static void ggml_compute_forward_sparsek_attn_f32( void ggml_compute_forward_sparsek_attn( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - ggml_compute_forward_sparsek_attn_f32(params, dst); + switch (dst->type) { + case GGML_TYPE_F32: + ggml_compute_forward_sparsek_attn_f32(params, dst); + break; + default: + GGML_ASSERT(false && "sparsek_attn: unsupported dst type"); + } } diff --git a/ggml/tests/test_sparsek_cpu.c b/ggml/tests/test_sparsek_cpu.c deleted file mode 100644 index 9cc82681d9356..0000000000000 --- a/ggml/tests/test_sparsek_cpu.c +++ /dev/null @@ -1,40 +0,0 @@ -#include "ggml.h" -#include -#include - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - struct ggml_context *ctx = ggml_init(params); - - // יצירת טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor *Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor *K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor *V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - struct ggml_tensor *Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); - - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute_with_ctx(ctx, NULL, 1); - - printf("Output tensor:\n"); - for (int i = 0; i < n*n; ++i) - printf("%.6f ", ((float*)Y->data)[i]); - printf("\n"); - - ggml_free(ctx); - return 0; -} diff --git a/tests/test_sparsek_cpu.c b/tests/test_sparsek_cpu.c deleted file mode 100644 index 0f6c082ed2f31..0000000000000 --- a/tests/test_sparsek_cpu.c +++ /dev/null @@ -1,50 +0,0 @@ -#define GGML_USE_DEFAULT_BACKEND 1 -#include "ggml.h" -#include -#include -#include // memcpy - -// הצהרה קדמית לפונקציה הישנה של ggml -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); - -int main() { - struct ggml_init_params params = { - .mem_size = 16 * 1024 * 1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - struct ggml_context * ctx = ggml_init(params); - - // ניצור טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - - struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, 
K, V, 1, 0, 0); - - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute(ctx, Y, 1); - - printf("SPARSEK CPU test finished successfully.\n"); - printf("Output tensor:\n"); - - for (int i = 0; i < n * n; ++i) { - printf("%.6f ", ((float *)Y->data)[i]); - } - printf("\n"); - - ggml_free(ctx); - return 0; -} diff --git a/tmp-test/test_sparsek_cpu.c b/tmp-test/test_sparsek_cpu.c deleted file mode 100644 index 8358e01f9d612..0000000000000 --- a/tmp-test/test_sparsek_cpu.c +++ /dev/null @@ -1,51 +0,0 @@ -#include "ggml.h" -#include -#include -#include // בשביל memcpy - -// הצהרה קדמית לפונקציה הישנה -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_tensor * tensor, int n_threads); - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - struct ggml_context * ctx = ggml_init(params); - - // טנזורים קטנים לבדיקה - int n = 2; - float q_data[4] = {1.0, 2.0, 3.0, 4.0}; - float k_data[4] = {1.0, 0.0, 0.0, 1.0}; - float v_data[4] = {5.0, 6.0, 7.0, 8.0}; - - struct ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - struct ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); - - memcpy(Q->data, q_data, sizeof(q_data)); - memcpy(K->data, k_data, sizeof(k_data)); - memcpy(V->data, v_data, sizeof(v_data)); - - printf("Running ggml_sparsek_attn CPU test...\n"); - - // קריאה לפונקציה שלך - struct ggml_tensor * Y = ggml_sparsek_attn(ctx, Q, K, V, 1, 0, 0); - - // חישוב - ggml_build_forward_expand(NULL, Y); - ggml_graph_compute(ctx, Y, 1); - - printf("SPARSEK CPU test finished successfully.\n"); - printf("Output tensor:\n"); - - for (int i = 0; i < n * n; ++i) { - printf("%.6f ", ((float *)Y->data)[i]); - } - printf("\n"); - - ggml_free(ctx); - return 0; -} From 612fdca9094dc0e9d161499deae3b84ff26be8e0 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 10:35:16 +0200 Subject: [PATCH 05/15] fix SparseK CPU operator implementation Co-authored-by: Yael Co-authored-by: Tamar --- ggml/src/ggml-cpu/ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 788b5e8954f75..8465f4553bd3b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7916,7 +7916,7 @@ static void ggml_compute_forward_sparsek_attn_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - // Single-threaded baseline version (expand later for parallelism) + // Single-threaded baseline version if (params->ith != 0) return; const struct ggml_tensor * Q = dst->src[0]; From b0194f4235d830b200fe19e1b9a69935559dfad2 Mon Sep 17 00:00:00 2001 From: yael-works Date: Thu, 30 Oct 2025 10:37:56 +0200 Subject: [PATCH 06/15] trigger refresh From d02d937d1fbdc8f77c7d8f7f107daf302e5ac7e5 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 12:48:03 +0200 Subject: [PATCH 07/15] test commit from Gitty --- gitty_test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 gitty_test.txt diff --git a/gitty_test.txt b/gitty_test.txt new file mode 100644 index 0000000000000..bb89e2762756f --- /dev/null +++ b/gitty_test.txt @@ -0,0 +1 @@ +# test from Gitty From 5fa78a2b1fa2e8e9c63ec556ec6b093fb105ba77 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 12:49:20 +0200 Subject: [PATCH 08/15] remove test file --- gitty_test.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 gitty_test.txt diff --git 
a/gitty_test.txt b/gitty_test.txt deleted file mode 100644 index bb89e2762756f..0000000000000 --- a/gitty_test.txt +++ /dev/null @@ -1 +0,0 @@ -# test from Gitty From b19c244036b22b093c12ec31a6090a31a31d8fe4 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 13:35:08 +0200 Subject: [PATCH 09/15] feat: implement SparseK attention core logic Co-authored-by: Yael Co-authored-by: Gitty --- ggml/include/ggml.h | 6 +++--- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- tests/test-backend-ops.cpp | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 25c8343fc3315..ad24f341bdd5d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2234,9 +2234,9 @@ extern "C" { GGML_API struct ggml_tensor * ggml_sparsek_attn( struct ggml_context * ctx, - struct ggml_tensor * Q, - struct ggml_tensor * K, - struct ggml_tensor * V, + struct ggml_tensor * Q, + struct ggml_tensor * K, + struct ggml_tensor * V, int32_t k_top, int32_t win_local, int32_t stride_global); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 275d1a22fd381..3fa954e1c324a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1955,7 +1955,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_OP_SPARSEK_ATTN: { ggml_compute_forward_sparsek_attn(params, tensor); - } break; + } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e899bb8c50168..fd78d4e06d2c7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7251,7 +7251,6 @@ static std::vector> make_test_cases_eval() { // Test cases for performance evaluation: should be representative of real-world use cases static std::vector> make_test_cases_perf() { std::vector> test_cases; - // Conv2d: K=CRS=NPQ=4096 matmul performance uint32_t iwh_idx = 0; uint32_t kwh_idx = 1; From 49c7e4b1947242198bfd2167ef960f23d7757082 Mon Sep 17 00:00:00 2001 From: yael-works Date: Thu, 30 Oct 2025 14:14:51 +0200 Subject: [PATCH 10/15] Implement final optimized SparseK Attention (CPU) Co-authored-by: Yael Co-authored-by: Gitty --- ggml/src/ggml-cpu/ops.cpp | 152 +++++++++++++++++++++++++++----------- 1 file changed, 110 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 8465f4553bd3b..ce6169de3fb9b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7909,14 +7909,30 @@ void ggml_compute_forward_argsort( } //------------------------------------------------------------------------------ -// SparseK Attention (CPU) +// SparseK Attention (CPU, final optimized version) //------------------------------------------------------------------------------ +// +// Implements SparseK Attention as a GGML operator for the CPU backend. 
+// Features:
+//   • Top-K filtering using nth_element (O(N))
+//   • Optional local window (win_local)
+//   • Optional global stride (stride_glb)
+//   • Numerically stable softmax
+//   • Preallocated buffers for performance
+//
+// Author: Yael Shuker (yael-works)
+//------------------------------------------------------------------------------
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>

 static void ggml_compute_forward_sparsek_attn_f32(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst) {

-    // Single-threaded baseline version 
+    // Single-threaded baseline version
     if (params->ith != 0) return;

     const struct ggml_tensor * Q = dst->src[0];
@@ -7929,80 +7945,132 @@ static void ggml_compute_forward_sparsek_attn_f32(
     GGML_ASSERT(V->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

+    // Operator parameters
     const int32_t k_top = ggml_get_op_params_i32(dst, 0);
-    const int32_t win_local = ggml_get_op_params_i32(dst, 1);
-    const int32_t stride_glb = ggml_get_op_params_i32(dst, 2);
-    GGML_UNUSED(win_local);
-    GGML_UNUSED(stride_glb);
+    const int32_t win_local = ggml_get_op_params_i32(dst, 1); // -1 ⇒ no local window
+    const int32_t stride_glb = ggml_get_op_params_i32(dst, 2); // ≤1 ⇒ no global stride
+
+    const bool use_local = (win_local >= 0);
+    const bool use_stride = (stride_glb > 1);

-    // Tensor dimensions according to GGML layout: ne[0]=d, ne[1]=seq, ne[2]=head, ne[3]=batch
+    // GGML tensor dimensions: ne[0]=D, ne[1]=T, ne[2]=H, ne[3]=B
     const int64_t D = Q->ne[0];
     const int64_t T = Q->ne[1];
     const int64_t H = Q->ne[2];
     const int64_t B = Q->ne[3];

-    // Temporary buffer for attention scores for one query row
-    std::vector<float> attn_row(T, 0.0f);
+    // Dimension validation
+    GGML_ASSERT(K->ne[0] == D && V->ne[0] == D);
+    GGML_ASSERT(K->ne[1] == T && V->ne[1] == T);
+    GGML_ASSERT(K->ne[2] == H && V->ne[2] == H);
+    GGML_ASSERT(K->ne[3] == B && V->ne[3] == B);
+
+    // Parameter sanity checks
+    GGML_ASSERT(k_top >= 0 && k_top <= (int32_t)T);
+    GGML_ASSERT(win_local >= -1);
+    GGML_ASSERT(stride_glb >= 0);

-    const float scale = 1.0f / sqrtf((float) D);
+    const float scale = 1.0f / sqrtf((float)D);
+    const float NINF = -std::numeric_limits<float>::infinity();
+
+    // Preallocated buffers to avoid heap churn
+    std::vector<float> attn_row((size_t)T, NINF);
+    std::vector<int32_t> cand_idx; cand_idx.reserve((size_t)T);
+    std::vector<float> scores; scores.reserve((size_t)T);

-    // Loops over batch, head, and query token
     for (int64_t b = 0; b < B; ++b) {
         for (int64_t h = 0; h < H; ++h) {
             for (int64_t iq = 0; iq < T; ++iq) {

-                // (1) Compute dot products Q·K within same (b,h)
-                const char * qbase = (const char *) Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1];
-                const float * qv = (const float *) qbase;
+                // (0) Build candidate index list (always include self)
+                cand_idx.clear();
+                scores.clear();
+
+                if (!use_local && !use_stride) {
+                    // No sparsity: attend to all tokens
+                    for (int64_t j = 0; j < T; ++j)
+                        cand_idx.push_back((int32_t)j);
+                } else {
+                    // Apply local window and/or global stride
+                    for (int64_t j = 0; j < T; ++j) {
+                        const int64_t dist = iq >= j ? 
iq - j : j - iq; + const bool pass_local = use_local && (dist <= (int64_t)win_local); + const bool pass_stride = use_stride && (stride_glb > 0 && j % stride_glb == 0); + if (pass_local || pass_stride || j == iq) + cand_idx.push_back((int32_t)j); + } + } + + // Edge case: no candidates or k_top==0 → output zeros + if (k_top == 0 || cand_idx.empty()) { + float * y0 = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + std::fill(y0, y0 + D, 0.0f); + continue; + } - for (int64_t j = 0; j < T; ++j) { - const char * kbase = (const char *) K->data + b*K->nb[3] + h*K->nb[2] + j*K->nb[1]; - const float * kv = (const float *) kbase; + // (1) Compute scaled dot-product Q·K only for candidates + std::fill(attn_row.begin(), attn_row.end(), NINF); + const float * qv = (const float *)((const char *)Q->data + b*Q->nb[3] + h*Q->nb[2] + iq*Q->nb[1]); + for (int32_t j : cand_idx) { + const float * kv = (const float *)((const char *)K->data + b*K->nb[3] + h*K->nb[2] + (int64_t)j*K->nb[1]); float dot = 0.0f; - for (int64_t d = 0; d < D; ++d) { + for (int64_t d = 0; d < D; ++d) dot += qv[d] * kv[d]; - } attn_row[j] = dot * scale; } - // (2) Select top-k threshold using nth_element - const int kk = std::max(1, std::min((int)T, k_top)); - std::vector tmp(attn_row.begin(), attn_row.end()); - std::nth_element(tmp.begin(), tmp.begin() + (kk - 1), tmp.end(), std::greater()); - const float thr = tmp[kk - 1]; + // (2) Determine true Top-K threshold using nth_element + const int num_candidates = (int)cand_idx.size(); + const int kk = std::min(std::max(1, k_top), num_candidates); + + if (kk < num_candidates) { + scores.resize((size_t)num_candidates); + for (size_t i = 0; i < cand_idx.size(); ++i) + scores[i] = attn_row[cand_idx[i]]; + + std::nth_element(scores.begin(), scores.begin() + (kk - 1), scores.end(), std::greater()); + const float thr = scores[kk - 1]; - for (int64_t j = 0; j < T; ++j) { - if (attn_row[j] < thr) attn_row[j] = -INFINITY; + // Mask all values below the threshold + for (int32_t j : cand_idx) + if (attn_row[j] < thr) attn_row[j] = NINF; } - // (3) Numerically stable softmax on the masked row - float maxv = -INFINITY; - for (int64_t j = 0; j < T; ++j) { + // (3) Numerically stable softmax + float maxv = NINF; + for (int32_t j : cand_idx) maxv = std::max(maxv, attn_row[j]); + + // Handle all-masked rows + if (!std::isfinite(maxv)) { + float * y0 = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); + std::fill(y0, y0 + D, 0.0f); + continue; } + float sum = 0.0f; - for (int64_t j = 0; j < T; ++j) { - float v = attn_row[j] - maxv; - float e = expf(v); + for (int32_t j : cand_idx) { + if (attn_row[j] == NINF) continue; + const float e = expf(attn_row[j] - maxv); attn_row[j] = e; sum += e; } - const float inv_sum = sum > 0.0f ? 1.0f / sum : 0.0f; - for (int64_t j = 0; j < T; ++j) { + + const float inv_sum = (sum > 0.0f) ? 
(1.0f / sum) : 0.0f; + for (int32_t j : cand_idx) { + if (attn_row[j] == NINF) continue; attn_row[j] *= inv_sum; } - // (4) Compute output = A·V (weighted sum) - float * y = (float *) ((char *) dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); - + // (4) Compute output y = A·V + float * y = (float *)((char *)dst->data + b*dst->nb[3] + h*dst->nb[2] + iq*dst->nb[1]); for (int64_t d = 0; d < D; ++d) { float acc = 0.0f; - for (int64_t j = 0; j < T; ++j) { + for (int32_t j : cand_idx) { const float aij = attn_row[j]; - if (aij == 0.0f) continue; // skip masked entries - const char * vbase = (const char *) V->data + b*V->nb[3] + h*V->nb[2] + j*V->nb[1]; - const float * vv = (const float *) vbase; + if (!(aij > 0.0f)) continue; // skip zero or masked + const float * vv = (const float *)((const char *)V->data + b*V->nb[3] + h*V->nb[2] + (int64_t)j*V->nb[1]); acc += aij * vv[d]; } y[d] = acc; @@ -8012,7 +8080,7 @@ static void ggml_compute_forward_sparsek_attn_f32( } GGML_PRINT_DEBUG("[SPARSEK CPU] k_top=%d win_local=%d stride=%d\n", - k_top, win_local, stride_glb); + k_top, win_local, stride_glb); } void ggml_compute_forward_sparsek_attn( From 939bbd9d7166b1eebf68edb09c4e1e84bac3c486 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Thu, 30 Oct 2025 17:44:20 +0200 Subject: [PATCH 11/15] style: remove trailing whitespace and fix indentation in test-backend-ops.cpp Co-authored-by: Gitty Burstein Co-authored-by: Yael Shuker --- ggml/src/ggml-cpu/ops.cpp | 2 +- tests/test-backend-ops.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index ce6169de3fb9b..762d340761d03 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7920,7 +7920,7 @@ void ggml_compute_forward_argsort( // • Numerically stable softmax // • Preallocated buffers for performance // -// Author: Yael Shuker (yael-works) +// Author: Yael Shuker & Gitty Burstein //------------------------------------------------------------------------------ #include diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fd78d4e06d2c7..8da7c730e3f74 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7208,7 +7208,7 @@ static std::vector> make_test_cases_eval() { continue; } for (bool with_bias : {false, true}) { - if (!with_gate && !with_bias) { + if (!with_gate && !with_bias) { continue; } for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { @@ -7252,12 +7252,12 @@ static std::vector> make_test_cases_eval() { static std::vector> make_test_cases_perf() { std::vector> test_cases; // Conv2d: K=CRS=NPQ=4096 matmul performance - uint32_t iwh_idx = 0; - uint32_t kwh_idx = 1; - uint32_t Cout_idx = 2; - uint32_t Cin_idx = 3; - uint32_t B_idx = 4; - std::vector> cases = { + uint32_t iwh_idx = 0; + uint32_t kwh_idx = 1; + uint32_t Cout_idx = 2; + uint32_t Cin_idx = 3; + uint32_t B_idx = 4; + std::vector> cases = { //{IWH, KWH, Cout, Cin, B} // K=CRS=NPQ=4096 conv2d matmul performance {19, 4, 4096, 256, 16}, From 1983ab3e0464ee34f0d0e844105b6ca8be92e0f3 Mon Sep 17 00:00:00 2001 From: Gitty Burstein Date: Fri, 31 Oct 2025 01:37:25 +0200 Subject: [PATCH 12/15] delete Trailing whitespace Co-authored-by: Gitty Burstein Co-authored-by: Yael Shuker --- tests/test-backend-ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8da7c730e3f74..eb69f2e669514 100644 --- a/tests/test-backend-ops.cpp +++ 
b/tests/test-backend-ops.cpp @@ -7258,8 +7258,8 @@ static std::vector> make_test_cases_perf() { uint32_t Cin_idx = 3; uint32_t B_idx = 4; std::vector> cases = { - //{IWH, KWH, Cout, Cin, B} - // K=CRS=NPQ=4096 conv2d matmul performance +// {IWH, KWH, Cout, Cin, B} +// K=CRS=NPQ=4096 conv2d matmul performance {19, 4, 4096, 256, 16}, // K=128, CRS=128, NPQ=4096 { 19, 4, 128, 8, 16}, From 202c5d14115eee43c2f575db615b56b02ae1c2d3 Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:19 +0200 Subject: [PATCH 13/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index eb69f2e669514..839b29c44143d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7208,7 +7208,7 @@ static std::vector> make_test_cases_eval() { continue; } for (bool with_bias : {false, true}) { - if (!with_gate && !with_bias) { + if (!with_gate && !with_bias) { continue; } for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { From 971296774c2c613188380d4264fa7f2e08e7f46e Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:30 +0200 Subject: [PATCH 14/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 839b29c44143d..937576023c956 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7135,7 +7135,7 @@ static std::vector> make_test_cases_eval() { if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA - + for (bool mask : { true, false } ) { for (bool sinks : { true, false } ) { for (float max_bias : { 0.0f, 8.0f }) { From 77f4088b1220bd45a65668c0f0f2c35055c3cad5 Mon Sep 17 00:00:00 2001 From: GittyBurstein Date: Fri, 31 Oct 2025 01:56:42 +0200 Subject: [PATCH 15/15] Update tests/test-backend-ops.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 937576023c956..5350ea13e6ee6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7178,7 +7178,7 @@ static std::vector> make_test_cases_eval() { for (int64_t d_qk : {64, 128}) { for (int64_t d_v : {64, 128}) { for (int64_t n_head : {4, 8}) { - for (int64_t kv : {113, 512}) { + for (int64_t kv : {113, 512}) { for (int64_t b : {1, 4}) { for (int32_t k_top : {16, 32}) { for (int32_t win_local : {32, 64}) {