
Commit fd96766

Integrate SparseK Attention via FlashAttention extension (CPU backend) [yael-works]
1 parent a063c64 commit fd96766

File tree

  ggml/include/ggml.h
  ggml/src/ggml-cpu/ggml-cpu.c
  ggml/src/ggml-cpu/ops.cpp
  ggml/src/ggml.c
  tests/test-backend-ops.cpp

5 files changed, 121 additions and 102 deletions

ggml/include/ggml.h

Lines changed: 15 additions & 22 deletions
@@ -219,7 +219,7 @@
 #define GGML_MAX_PARAMS     2048
 #define GGML_MAX_SRC        10
 #define GGML_MAX_N_THREADS  512
-#define GGML_MAX_OP_PARAMS  64
+#define GGML_MAX_OP_PARAMS  128

 #ifndef GGML_MAX_NAME
 #    define GGML_MAX_NAME  64

@@ -530,7 +530,6 @@ extern "C" {
     GGML_OP_TIMESTEP_EMBEDDING,
     GGML_OP_ARGSORT,
     GGML_OP_LEAKY_RELU,
-    GGML_OP_SPARSEK_ATTN,
     GGML_OP_FLASH_ATTN_EXT,
     GGML_OP_FLASH_ATTN_BACK,
     GGML_OP_SSM_CONV,

@@ -2232,26 +2231,6 @@ extern "C" {
     // n_head % ne32 == 0
     // ne3 % ne33 == 0
     //
-
-    GGML_API struct ggml_tensor * ggml_sparsek_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * Q,
-            struct ggml_tensor  * K,
-            struct ggml_tensor  * V,
-            int32_t               k_top,
-            int32_t               win_local,
-            int32_t               stride_global);
-
-    GGML_API void ggml_sparsek_attn_set_params(
-            struct ggml_tensor  * a,
-            int32_t               k_top,
-            int32_t               win_local,
-            int32_t               stride_global);
-
-    GGML_API int32_t ggml_sparsek_attn_get_param(
-            const struct ggml_tensor * a,
-            int index);
-
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,

@@ -2281,6 +2260,20 @@ extern "C" {
             struct ggml_tensor  * v,
             struct ggml_tensor  * d,
             bool                  masked);
+    // Optional SparseK parameters (disabled if use_sparsek=false)
+    GGML_API void ggml_flash_attn_ext_set_sparsek(
+            struct ggml_tensor * a,
+            bool                 use_sparsek,
+            int32_t              k_top,
+            int32_t              win_local,
+            int32_t              stride_global);
+
+    GGML_API void ggml_flash_attn_ext_get_sparsek(
+            const struct ggml_tensor * a,
+            bool    * use_sparsek,
+            int32_t * k_top,
+            int32_t * win_local,
+            int32_t * stride_global);

     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
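
For orientation, a minimal sketch of how a caller might use the new header API: build a regular flash-attention node, then opt in to SparseK on top of it. The helper name, tensor shapes, and the scale/max_bias/bias values and SparseK parameters below are illustrative assumptions, not part of this commit.

// Hypothetical usage sketch (not from this commit).
#include "ggml.h"

static struct ggml_tensor * build_attn_with_sparsek(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * mask) {
    // regular FlashAttention node (scale / max_bias / bias values are examples)
    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask,
                                                   /*scale=*/1.0f, /*max_bias=*/0.0f, /*bias=*/0.0f);

    // enable SparseK: keep the 64 best-scoring keys, a +/-128 local window,
    // and every 32nd key globally (values chosen purely for illustration)
    ggml_flash_attn_ext_set_sparsek(out, /*use_sparsek=*/true,
                                    /*k_top=*/64, /*win_local=*/128, /*stride_global=*/32);
    return out;
}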

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 4 deletions
@@ -1947,10 +1947,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_flash_attn_ext(params, tensor);
             } break;
-        case GGML_OP_SPARSEK_ATTN:
-            {
-                ggml_compute_forward_sparsek_attn(params, tensor);
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
                 int32_t t = ggml_get_op_params_i32(tensor, 0);

ggml/src/ggml-cpu/ops.cpp

Lines changed: 56 additions & 22 deletions
@@ -5107,6 +5107,14 @@ static void ggml_compute_forward_soft_max_f32(
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+    // SparseK parameters (from op_params)
+    const bool    use_sparsek = ggml_get_op_params_i32(dst, 30) != 0;
+    const int32_t k_top       = ggml_get_op_params_i32(dst, 31);
+    const int32_t win_local   = ggml_get_op_params_i32(dst, 32);
+    const int32_t stride_glb  = ggml_get_op_params_i32(dst, 33);
+    (void)use_sparsek; (void)k_top; (void)win_local; (void)stride_glb;
+
+
     float * wp = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

     const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

@@ -8182,6 +8190,13 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+    // -------- SparseK op_params (only reads the parameters, changes nothing else) --------
+    const bool    use_sparsek = ggml_get_op_params_i32(dst, 30) != 0;
+    const int32_t k_top       = ggml_get_op_params_i32(dst, 31);
+    const int32_t win_local   = ggml_get_op_params_i32(dst, 32);
+    const int32_t stride_glb  = ggml_get_op_params_i32(dst, 33);
+    // --------------------------------------------------------------------------------------
+
     ggml_type         const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type;
     ggml_from_float_t const q_to_vec_dot   = ggml_get_type_traits_cpu(k_vec_dot_type)->from_float;
     ggml_vec_dot_t    const kq_vec_dot     = ggml_get_type_traits_cpu(k->type)->vec_dot;

@@ -8200,7 +8215,7 @@
        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);

        const uint32_t h = iq2; // head index
-       const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+       const float slope = (max_bias > 0.0f) ? (h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)) : 1.0f;

        float S = 0.0f;      // sum
        float M = -INFINITY; // maximum KQ value

@@ -8229,18 +8244,51 @@
        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
        q_to_vec_dot(pq, Q_q, DK);

-       // online softmax / attention
-       // loop over n_kv and n_head_kv
-       // ref: https://arxiv.org/pdf/2112.05682.pdf
-       for (int64_t ic = 0; ic < nek1; ++ic) {
+       // ------------------------ SparseK: build the candidate list ------------------------
+       std::vector<int> cand_idx;
+       cand_idx.reserve((size_t)nek1);
+
+       if (!use_sparsek) {
+           for (int64_t t = 0; t < nek1; ++t) cand_idx.push_back((int)t);
+       } else {
+           for (int64_t t = 0; t < nek1; ++t) {
+               const int  dist      = std::abs((int)iq1 - (int)t);
+               const bool in_local  = (win_local >= 0 && dist <= win_local);
+               const bool in_stride = (stride_glb > 1 && (t % stride_glb) == 0);
+               if (in_local || in_stride || t == iq1) cand_idx.push_back((int)t);
+           }
+           if (k_top > 0 && (int)cand_idx.size() > k_top) {
+               std::vector<float> vals; vals.reserve(cand_idx.size());
+               for (int idx : cand_idx) {
+                   float tmp_s;
+                   const char * k_data = (const char *) k->data + (idx*nbk1 + ik2*nbk2 + ik3*nbk3);
+                   kq_vec_dot(DK, &tmp_s, 0, k_data, 0, Q_q, 0, 1);
+                   vals.push_back(tmp_s * scale);
+               }
+               std::nth_element(vals.begin(), vals.begin() + (k_top - 1), vals.end(), std::greater<float>());
+               const float thr = vals[k_top - 1];
+
+               std::vector<int> filtered; filtered.reserve(k_top);
+               for (int idx : cand_idx) {
+                   float tmp_s;
+                   const char * k_data = (const char *) k->data + (idx*nbk1 + ik2*nbk2 + ik3*nbk3);
+                   kq_vec_dot(DK, &tmp_s, 0, k_data, 0, Q_q, 0, 1);
+                   if (tmp_s * scale >= thr) filtered.push_back(idx);
+               }
+               cand_idx.swap(filtered);
+           }
+       }
+       // ------------------------------------------------------------------------------
+
+       // ----- Flash Attention core: same code as before, only iterating over cand_idx instead of every ic -----
+       for (int ic : cand_idx) {
            const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
            if (mv == -INFINITY) {
                continue;
            }

            float s; // KQ value
-
-           const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
+           const char * k_data = (const char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3);
            kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);

            s = s*scale; // scale KQ value

@@ -8260,44 +8308,33 @@

            if (v->type == GGML_TYPE_F16) {
                if (s > M) {
-                   // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
                    M = s;
                    ms = expf(Mold - M);
-
-                   // V = V*expf(Mold - M)
                    ggml_vec_scale_f16(DV, VKQ16, ms);
                } else {
-                   // no new maximum, ms == 1.0f, vs != 1.0f
                    vs = expf(s - M);
                }
-
-               // V += v*expf(s - M)
                ggml_vec_mad_f16(DV, VKQ16, (const ggml_fp16_t *) v_data, vs);
            } else {
                if (s > M) {
-                   // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
                    M = s;
                    ms = expf(Mold - M);
-
-                   // V = V*expf(Mold - M)
                    ggml_vec_scale_f32(DV, VKQ32, ms);
                } else {
-                   // no new maximum, ms == 1.0f, vs != 1.0f
                    vs = expf(s - M);
                }

-               // V += v*expf(s - M)
                if (v_to_float) {
                    v_to_float(v_data, V32, DV);
                    ggml_vec_mad_f32(DV, VKQ32, V32, vs);
                } else {
-                   // V is F32
                    ggml_vec_mad_f32(DV, VKQ32, (const float *) v_data, vs);
                }
            }

            S = S*ms + vs; // scale and increment sum with partial sum
        }
+       // ------------------------------------------------------------------------------

        if (v->type == GGML_TYPE_F16) {
            for (int64_t d = 0; d < DV; ++d) {

@@ -8331,9 +8368,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
        const int i2 = iq2;
        const int i3 = iq3;

-       // original
-       //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
-
        // permute(0, 2, 1, 3)
        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
    }
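
Pulled out of the kernel for readability, the selection rule above amounts to the following self-contained sketch. The function name and the precomputed-score input are assumptions made for illustration; in the kernel the scores come from kq_vec_dot on the fly.

// Illustrative sketch of the SparseK candidate rule: a key index t is a candidate
// for query position q_pos if it lies inside the local window, on the global
// stride, or equals q_pos; if more than k_top candidates remain, only those whose
// score reaches the k_top-th largest candidate score are kept (ties included).
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <vector>

static std::vector<int> sparsek_candidates(
        const std::vector<float> & scores, // precomputed scaled Q.K scores, one per key
        int64_t q_pos, int32_t k_top, int32_t win_local, int32_t stride_global) {
    const int64_t n_kv = (int64_t) scores.size();

    std::vector<int> cand;
    for (int64_t t = 0; t < n_kv; ++t) {
        const int  dist      = std::abs((int)(q_pos - t));
        const bool in_local  = win_local >= 0 && dist <= win_local;
        const bool in_stride = stride_global > 1 && (t % stride_global) == 0;
        if (in_local || in_stride || t == q_pos) {
            cand.push_back((int) t);
        }
    }

    if (k_top > 0 && (int) cand.size() > k_top) {
        std::vector<float> vals;
        for (int idx : cand) {
            vals.push_back(scores[idx]);
        }
        // threshold = k_top-th largest candidate score
        std::nth_element(vals.begin(), vals.begin() + (k_top - 1), vals.end(), std::greater<float>());
        const float thr = vals[k_top - 1];

        std::vector<int> filtered;
        for (int idx : cand) {
            if (scores[idx] >= thr) {
                filtered.push_back(idx);
            }
        }
        cand.swap(filtered);
    }
    return cand;
}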

ggml/src/ggml.c

Lines changed: 37 additions & 50 deletions
@@ -990,7 +990,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",
-    "SPARSEK_ATTN",
+
     "FLASH_ATTN_EXT",
     "FLASH_ATTN_BACK",
     "SSM_CONV",

@@ -1019,7 +1019,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };

-static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
+static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1094,7 +1094,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
-    "sparsek_attn(x)",
     "flash_attn_ext(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",

@@ -1123,7 +1122,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
+static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -5063,52 +5062,6 @@ struct ggml_tensor * ggml_top_k(
     return result;
 }

-// ggml_sparsek_attn
-struct ggml_tensor * ggml_sparsek_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * Q,
-        struct ggml_tensor  * K,
-        struct ggml_tensor  * V,
-        int32_t               k_top,
-        int32_t               win_local,
-        int32_t               stride_global) {
-
-    GGML_ASSERT(ggml_can_mul_mat(K, Q));
-    GGML_ASSERT(Q->ne[3] == K->ne[3] && Q->ne[3] == V->ne[3]);
-
-    int64_t ne[4] = { V->ne[0], Q->ne[2], Q->ne[1], Q->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-
-    int32_t params_i32[3] = { k_top, win_local, stride_global };
-    ggml_set_op_params(result, params_i32, sizeof(params_i32));
-
-    result->op     = GGML_OP_SPARSEK_ATTN;
-    result->src[0] = Q;
-    result->src[1] = K;
-    result->src[2] = V;
-
-    return result;
-}
-
-
-void ggml_sparsek_attn_set_params(struct ggml_tensor * a,
-                                  int32_t k_top,
-                                  int32_t win_local,
-                                  int32_t stride_global) {
-    GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN);
-    ggml_set_op_params_i32(a, 0, k_top);
-    ggml_set_op_params_i32(a, 1, win_local);
-    ggml_set_op_params_i32(a, 2, stride_global);
-}
-
-int32_t ggml_sparsek_attn_get_param(const struct ggml_tensor * a, int index) {
-    GGML_ASSERT(a->op == GGML_OP_SPARSEK_ATTN);
-    return ggml_get_op_params_i32(a, index);
-}
-
-
-
 // ggml_flash_attn_ext

 struct ggml_tensor * ggml_flash_attn_ext(

@@ -5262,6 +5215,40 @@ struct ggml_tensor * ggml_flash_attn_back(
     return result;
 }

+#define GGML_FA_EXT_PARAM_SPARSEK_FLAG   30
+#define GGML_FA_EXT_PARAM_SPARSEK_KTOP   31
+#define GGML_FA_EXT_PARAM_SPARSEK_WIN    32
+#define GGML_FA_EXT_PARAM_SPARSEK_STRIDE 33
+
+void ggml_flash_attn_ext_set_sparsek(struct ggml_tensor * a,
+                                     bool    use_sparsek,
+                                     int32_t k_top,
+                                     int32_t win_local,
+                                     int32_t stride_global) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+    a->op_params[GGML_FA_EXT_PARAM_SPARSEK_FLAG]   = use_sparsek ? 1 : 0;
+    a->op_params[GGML_FA_EXT_PARAM_SPARSEK_KTOP]   = k_top;
+    a->op_params[GGML_FA_EXT_PARAM_SPARSEK_WIN]    = win_local;
+    a->op_params[GGML_FA_EXT_PARAM_SPARSEK_STRIDE] = stride_global;
+}
+
+void ggml_flash_attn_ext_get_sparsek(const struct ggml_tensor * a,
+                                     bool    * use_sparsek,
+                                     int32_t * k_top,
+                                     int32_t * win_local,
+                                     int32_t * stride_global) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+    if (use_sparsek)
+        *use_sparsek   = a->op_params[GGML_FA_EXT_PARAM_SPARSEK_FLAG] != 0;
+    if (k_top)
+        *k_top         = a->op_params[GGML_FA_EXT_PARAM_SPARSEK_KTOP];
+    if (win_local)
+        *win_local     = a->op_params[GGML_FA_EXT_PARAM_SPARSEK_WIN];
+    if (stride_global)
+        *stride_global = a->op_params[GGML_FA_EXT_PARAM_SPARSEK_STRIDE];
+}
+
+
 // ggml_ssm_conv

 struct ggml_tensor * ggml_ssm_conv(
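
A small readback sketch, assuming a GGML_OP_FLASH_ATTN_EXT node named attn built elsewhere; it only shows how the new getter is meant to be called, with NULL allowed for any output that is not needed. The function name is an assumption for illustration.

// Hypothetical readback sketch: query the SparseK settings stored on a
// GGML_OP_FLASH_ATTN_EXT node, e.g. from a backend or a test.
#include "ggml.h"
#include <cstdio>

static void print_sparsek_config(const struct ggml_tensor * attn) {
    bool    use_sparsek   = false;
    int32_t k_top         = 0;
    int32_t win_local     = 0;
    int32_t stride_global = 0;
    ggml_flash_attn_ext_get_sparsek(attn, &use_sparsek, &k_top, &win_local, &stride_global);

    if (use_sparsek) {
        printf("SparseK on: k_top=%d win_local=%d stride_global=%d\n", k_top, win_local, stride_global);
    } else {
        printf("SparseK off: dense flash-attention path\n");
    }
}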

tests/test-backend-ops.cpp

Lines changed: 13 additions & 4 deletions
@@ -5513,12 +5513,21 @@ struct test_sparsek_attn : public test_case {
         ggml_set_name(K, "K");
         ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_v, n_tokens, n_head, batch);
         ggml_set_name(V, "V");
+        // ----------------------------------------------------------------------------
+        // SparseK Attention test (integrated via FlashAttention extension)
+        // ----------------------------------------------------------------------------
+        ggml_tensor * mask = NULL;
+        float scale    = 1.0f;
+        float max_bias = 0.0f;
+        float bias     = 0.0f;
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, Q, K, V, mask, scale, max_bias, bias);
+        ggml_flash_attn_ext_set_sparsek(out, true, k_top, win_local, stride_global);
+
+        ggml_set_name(out, "FLASH_ATTN_EXT_with_SPARSEK");
+        return out;

-        ggml_tensor * out = ggml_sparsek_attn(ctx, Q, K, V, k_top, win_local, stride_global);
-        ggml_set_name(out, "SPARSEK_ATTN_out");

-        return out;
-    }
+    }
 };